Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.
library(knitr)
library(reticulate)
#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
library(geosphere)
#Data wrangling and data viz
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggtext)
library(scatterpie)
library(tidyverse)
#Pop structure and pop genomic
library(GenomicFeatures)
library(SNPRelate) #PCA
library(LEA) #Clustering
library(pophelper)
#library(PopGenome) #Summary statistics
library(gridExtra)
library(ggExtra)
library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)
#GEA
library(lfmm)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
library(caret)
library(mgcv)
#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"
#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"
#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")
#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)
Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)
#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)
# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
mutate(Filter = "Mutants_etc"))
##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")
## Metadata of genotyped samples
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()
Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"),
col_names = temp,
na = "\\N", guess_max = 2000) %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
inner_join(., genotyped_samples) %>%
mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
dplyr::select(ID_file, Sampling_Date, Collection,
Country, Continent, Latitude, Longitude, Latitude2, Longitude2)
#genotyped_samples %>%
# filter(!(ID_file %in% filtered_samples$ID_file)) %>%
# write_tsv(Zt_list, col_names = F)
#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)
## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3",
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc", "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")
## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
# Uploading commands from the cluster to my computer.
# 1 - Variant calling - Nuclear
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall* ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
rsync -avP alice@130.125.25.244:/home/alice/WW_PopGen/Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args \
../WW_PopGen/Keep_lists_samples/
# 1 - Variant calling - Mito
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80.recode.vcf.gz ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/4_Mitochondrial_genome/0_Mito_blast/Complete_mitochondria_from_blast.txt ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
# 1 - Variant calling - Depth
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.1kb_windows.nuc_GC.tab ../0_Data/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/1_Depth_per_window/Depth_per*summary.q30.txt \
~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/1_Depth_per_window/
# Depth_per_sample_core_chr_summary.q30.txt
# Depth_per_sample_summary.q30.txt
# Depth_per_chromosome_summary.q30.txt
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/2_Depth_per_gene/Depth_per_genes_normalized.txt \
../1_Variant_calling/2_Depth_per_gene/
# 1 - Variant calling - SV
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/6_Mito_SV/All_complete_mitochondria.* \
~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/6_Mito_SV/
#All_complete_mitochondria.mcoords
#All_complete_mitochondria.snps
# 2 -Population structure
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/2_Population_structure/1_Mitochondrial_genome/Mitochondria.blast_results.tsv\
../2_Population_structure/1_Mitochondrial_genome/
# 4 - TE and RIP
rsync -avP \
alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads* \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
# Nb_reads_per_TE.txt
# Nb_reads.txt
rsync -avP \
alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/ \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/
# 5 - GEA
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/output/* ~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/
# 6 - Fungicides
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/6_Fungicide_resistance/genotypes_tidy_format.tab ~/Documents/Postdoc_Bruce/Projects/WW_project/6_Fungicide_resistance/
The sampling of Z.tritici isolated from the natural environment covers almost the entirety of the wheat-grown continents. It is, however, highly heterogeous. Europe has the highest sampling density. Several locations are heavily sampled, such a fields in Switzerland or the US. Some of the available genomes are still under embargo (waiting for the publication of their creator). We are also thinking about sequencing more genomes. Here you can view these different datasets on a map. Please select and unselect the different values to have a view of the changes with added datasets.
#kable(Zt_meta %>% dplyr::count(Collection, name = "Number of genomes"))
Zt_meta %>% dplyr::count(Collection, name = "Number of genomes")
## # A tibble: 18 x 2
## Collection `Number of genomes`
## <chr> <int>
## 1 BIOGER_Thierry 17
## 2 Eschikon_2017 94
## 3 ETHZ_2020 128
## 4 Fillinger 2
## 5 GWASpanel_BIOGER 90
## 6 Hartmann_FstQst_2015 132
## 7 Hartmann_Oregon_2016 94
## 8 JGI 43
## 9 JGI_1 1
## 10 JGI_2 32
## 11 JGI_3 20
## 12 JGI_4 3
## 13 JGI_Thierry 43
## 14 Megan_McDonald 13
## 15 MMcDonald_2018 92
## 16 Syngenta 283
## 17 Third_batch_2018_BM_TM 16
## 18 <NA> 6
max_circle = max(Zt_meta %>%
dplyr::count(Latitude, Longitude, name = "Number_genomes") %>%
dplyr::select(Number_genomes))
temp = Zt_meta %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0)
empty_map = ggplot() + theme_void() +
geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="#ede7e3", alpha=0.7)
empty_map +
geom_point (data = temp, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
size=Number_genomes,
text = Country),
alpha = 0.6, color = "#16697a") +
scale_size("Number of genomes", limits = c(1, max_circle))
p1 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
theme(legend.position = "none")
p2 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
theme(legend.position = "none")
p3 = empty_map +
geom_point(data = temp,
aes(x=as.numeric(Longitude), y=as.numeric(Latitude),size=Number_genomes, text = Country),
alpha = 0.6, color = "#ff9f1c") +
scale_size("Number of genomes", limits = c(1, max_circle)) + coord_cartesian(xlim=c(115, 175), ylim=c(-60, 10))
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3+ theme(legend.position = "none"), get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1))
The sampling also covers a wide range of years: starting from 1990 to 2017. Just as with the geographical repartition, some years are heavily sampled, reflecting sampling in specific fields done for previous experiments.
temp = as_tibble(c(min(Zt_meta$Sampling_Date, na.rm = T) : max(Zt_meta$Sampling_Date, na.rm = T))) %>%
mutate(`Sampling year` = as.character(value))
sum_temp = Zt_meta %>%
mutate(`Sampling year` = as.character(Sampling_Date)) %>%
dplyr::count(`Sampling year`, Continent) %>%
full_join(., temp) %>%
mutate(`Genome number` = replace_na(n, 0))
sum_temp %>%
ggplot(aes(x=`Sampling year`, y =`Genome number`, fill = Continent)) +
geom_bar(stat = "identity") +
theme_bw() + theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)) +
Fill_Continent
countries = dplyr::count(Zt_meta, Country, Continent)
countries %>% filter(n >= 8) %>%
ggplot(aes(x = Country, y =n, fill = Continent)) +
geom_bar(stat = "identity") +
Fill_Continent +
theme_cowplot() +
labs(x = element_blank(), y = "Number of samples") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1))
for (country in filter(countries, n >= 8) %>% pull(Country)) {
country_name = str_replace(country, pattern = " ", replacement = "_")
file_name = paste0(PopStr_dir, "Sample_list_", country_name, ".args")
Zt_meta %>% filter(Country == country) %>%
dplyr::select(ID_file) %>%
write_tsv(file = file_name, col_names = F)
}
for ( min_number in c(6, 10) ) {
small_pops = Zt_meta %>%
filter(!is.na(Country) & !is.na(Sampling_Date)) %>%
dplyr::count(Country, Latitude2, Longitude2, Sampling_Date, name = "Number_genomes") %>%
filter( Number_genomes >= min_number )
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
theme(legend.position = "None",
panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))+
scale_fill_manual(values = c("grey", K_colors) )
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#16697a")
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#16697a")
p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) +
geom_point(data = small_pops,
mapping = aes(x=Longitude2, y=Latitude2, size=Number_genomes),
alpha = 0.6, color = "#16697a")
aus_map = cowplot::plot_grid(p3, get_legend(p3), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.7, 1))
for (t in 1:nrow(small_pops)) {
country = pull(small_pops[t, "Country"])
year = pull(small_pops[t, "Sampling_Date"])
lat = pull(small_pops[t, "Latitude2"])
long = pull(small_pops[t, "Longitude2"])
country_name = str_replace(country, pattern = " ", replacement = "_")
for (i in 1:10 ){
file_name = paste0(PopStr_dir, "Sample_list_subset_",
paste(min_number, country_name, year, lat, long, i, sep = "_"),
".args")
temp = Zt_meta %>%
filter(Country == country) %>%
filter(Sampling_Date == year) %>%
filter(Latitude2 == lat) %>%
filter(Longitude2 == long) %>%
dplyr::select(ID_file) %>%
pull()
write_tsv(as.tibble(sample(temp, size = min_number)), file = file_name, col_names = F)
}
}
}
Question: How prelavent is aneuploidy in natural populations of Z.tritici? In the case of accessory chromosomes, is there a correlation between phylogeny, environment, host or time and the presence/absence of some chromosomes?
Methods: Based on the depth of coverage for all samples, we can identify for both core and accessory chromosomes whether each isolates includes 1, 0 or several copies.
core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
mutate(Median_core = Median)
chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
left_join(., core_depth %>%
dplyr::select(Sample, Median_core)) %>%
mutate(Relative_depth = Median/Median_core) %>%
left_join(.,Zt_meta, by = c("Sample" = "ID_file")) %>%
filter(CHROM != "mt") %>%
mutate(Sample = fct_reorder(Sample, Sampling_Date)) %>%
mutate(Sample = fct_reorder(Sample, Country)) %>%
mutate(Sample = fct_reorder(Sample, Continent))
heatmap_depth = chrom_depth %>%
filter(CHROM != "mt") %>%
ggplot(aes(x = as.numeric(CHROM), y=Sample, fill=Relative_depth)) +
geom_tile() + scale_fill_gradient2(low="white", high = "#479DAE") +
geom_vline(xintercept = 13.5, linetype = "longdash", colour = "gray20") +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(), axis.ticks.y=element_blank()) +
labs(fill = "Depth", x= "Chromosome") + xlim(c(0.5, 21.5))
plot_continent = chrom_depth %>%
ggplot(aes(x = 1, y=Sample, fill=Continent)) +
geom_tile(aes(width = 2)) +
theme_classic() +
theme(axis.text.y = element_blank(), axis.ticks.y=element_blank(),
legend.position="left",
axis.text.x=element_text(colour="white")) +
labs(y= "Isolate") + Fill_Continent
plot_grid(plot_continent, heatmap_depth, rel_widths = c(2, 5))
In the heatmap, I represent the depth normalized by the median depth over all core chromosomes. As expected, the copy-number variation at the chromosome scale affects mostly the accessory chromosomes (AC). There is some presence of supernumerary AC and a lot of presence-absence variation. Supernumerary chromosomes can also be found in the core chromosomes but this is almost anecdotal as over the whole sampling this was found only in 9 cases.
Lthres = 0.50
Hthres = 1.50
depth = chrom_depth %>%
dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High",ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
bar_Ndepth_per_CHR =ggplot(depth, aes(x = CHROM, fill = Depth_is)) +
geom_bar(stat = "count") +
scale_fill_manual(values =c("#16697a", "#82c0cc", "#EDE7E3")) +
theme_light()+
labs(x= "Chromosome", y = "Number of isolates")
#lollipop plots
##For high normalized depth values
temp = depth %>%
filter(Depth_is == "High") %>%
dplyr::group_by(CHROM) %>%
dplyr::count() %>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
lolhigh = ggplot(temp, aes(x = as.character(CHROM), y = n)) +
geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=max(temp$n)),
color="grey80", size = 1) +
geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=n),
color="grey20", size = 1) +
geom_point( color="#16697a", size=4) +
geom_text(aes( label = n,
y= n), stat= "identity",
hjust = -0.5, vjust = -0.2) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
) +
ylim(c(0,max(temp$n)+2+ max(temp$n)*0.1)) +
labs(x= "Chromosome", y = "Number of isolates with supernumerary chromosome") +
coord_flip()
##For low normalized depth values
temp = depth %>%
filter(Depth_is == "Low") %>%
dplyr::group_by(CHROM) %>%
dplyr::count()%>%
mutate(CHROM = fct_reorder(CHROM, as.numeric(CHROM)))
lollow = ggplot(temp, aes(x = CHROM, y = n)) +
geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=max(temp$n)),
color="grey80", size = 1) +
geom_segment( aes(x=CHROM, xend=CHROM, y=0, yend=n),
color="grey20", size = 1) +
geom_point( color="#82c0cc", size=4) +
geom_text(aes( label = n,
y= n), stat= "identity",
hjust = -0.5, vjust = -0.2) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
) +
ylim(c(0,max(temp$n)+ max(temp$n)*0.1)) +
labs( x= "Chromosome", y = "Number of isolates without chromosome") +
coord_flip()
bottom_row <- cowplot::plot_grid(lolhigh, lollow, labels = c('B', 'C'), label_size = 12)
plot_grid(bar_Ndepth_per_CHR, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)
Here is a table including the isolates with supernumerary core chromosomes.
depth %>%
dplyr::filter(Depth_is == "High") %>%
dplyr::filter(as.numeric(CHROM) < 13) %>%
dplyr::select(Sample, CHROM, Median, Median_core, Collection, Country)
## # A tibble: 18 x 6
## Sample CHROM Median Median_core Collection Country
## <fct> <fct> <dbl> <dbl> <chr> <chr>
## 1 08STCZ011 12 47 24 Syngenta Czech Repub…
## 2 08STF035 5 154 79 Syngenta France
## 3 08STF036 12 159 82 Syngenta France
## 4 09STD041 4 157 86 Syngenta Germany
## 5 ST00ARG_BD0069 5 17 9 <NA> <NA>
## 6 ST13SP_Biog_SeptoDu… 4 100 49 BIOGER_Thierry Spain
## 7 ST16CH_2X10 1 2 1 <NA> <NA>
## 8 ST16CH_2X10 2 2 1 <NA> <NA>
## 9 ST16CH_2X10 3 2 1 <NA> <NA>
## 10 ST16DK_Biog_DK15 5 16 8 <NA> <NA>
## 11 ST90ORE_a12_3B_11 12 32 19 Hartmann_FstQst_2… USA_Oregon
## 12 STnnJGI_SRR4235099 12 28 17 JGI_2 USA_Oregon
## 13 STnnJGI_SRR4235109 5 33 17 JGI_2 Switzerland
## 14 STnnJGI_SRR5194526 12 78 40 JGI Sweden
## 15 STnnJGI_SRR5194528 5 84 42 JGI Sweden
## 16 STnnJGI_SRR5194605 5 73 37 JGI Hungary
## 17 STnnJGI_SRR5829692 4 165 84 JGI Kenya
## 18 STnnJGI_SRR5829692 5 163 84 JGI Kenya
And an overlook of the accessory chromosomes PAV per continent (only considering continents with more than 10 isolates).
depth %>%
dplyr::filter(as.numeric(CHROM) > 13) %>%
dplyr::group_by(Continent, CHROM) %>%
dplyr::mutate(Count_sample_per_continent = n()) %>%
dplyr::filter(Count_sample_per_continent >= 10) %>%
ggplot(aes(x = Continent, fill = Depth_is)) +
geom_bar(position = "fill" ) + facet_wrap(CHROM~.) +
theme_light() +
scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
ylab("Proportion of chromosomes") + xlab("")
Estimates for the GC bias were done per isolate in the Depth.Rmd script. Here, I simply load this data and represent it graphically.
#
GC_per_window = read_tsv(paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.1kb_windows.nuc_GC.tab")) %>%
mutate(GC = round(`5_pct_gc`, 2)) %>%
filter( !is.na(`#1_usercol`))
temp = GC_per_window %>%
filter(`#1_usercol` < 14) %>%
dplyr::select(CHROM = `#1_usercol`, Win_start = `2_usercol`, Win_stop = `3_usercol`, GC) %>%
group_by(GC) %>%
dplyr::count()
# Histogram of GC values
xint = median(GC_per_window$GC)
temp %>%
ggplot(aes(x = GC, y = n, fill = n > 40)) +
geom_bar(stat = "identity") +
theme_light() +
scale_fill_manual(values =c( "#82c0cc", "#EDE7E3"))+
geom_vline(xintercept = xint, col = "#82c0cc") +
annotate("text", label = paste0("Median GC is ", xint),
x = xint - xint*0.2, y = 5200,
size = 4, colour = "#82c0cc", fontface = "bold")
The distribution of GC is slightly skewed in the GC-rich side, with a median above 0.5. However, there is an AT-rich fat tail, or even a bimodal distribution. These windows are most probably containing RIP-affected repeated regions.
From there, we need to know if the different collections present GC bias in their sequencing. We expect a batch effect in the sequencing here since we have data coming from a lot of different years and labs (and because batch effect happens in general).
GC_bias = read_tsv(paste0(depth_per_window_dir, "GC_bias_per_sample.txt"))
temp = GC_bias %>%
filter(Collection != "\\N") %>%
group_by(Collection) %>%
dplyr::count() %>%
filter(n >= 20)
inner_join(temp, GC_bias) %>%
group_by(Collection) %>%
mutate(Median = median(GC_bias_slope)) %>%
ggplot(., aes(x = reorder(Collection, Median), y = GC_bias_slope, fill = reorder(Collection, Median))) +
geom_violin(alpha = .7) +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
plot.margin = unit(c(0.5, 0.5, 0.5, 1), "cm")) +
labs(x = "Collection", fill = "Collection", y = "GC bias estimate")
It is very clear that we do have a batch effect but also that some datasets are very, very biased. The older Hartmann data (also called Qst population) in particular is strongly biased.
Question: Is the world-wide population of Z.tritici structured? If so, is it structured according to geography, host or time (or any other relevant info we hopefully have)?
Previous genomic work has shown very clear structure between populations of Z.tritici. However,the sampling was extremely heterogeneous. With a more geographically even sampling, do we also observe a clear-cut structure?
Methods: Analyses to create would be:
The clustering here is done by using the snmf method from the LEA R package (http://membres-timc.imag.fr/Olivier.Francois/LEA/) on the same subset of SNPs as the PCA, but without any missing data. I ran the analysis for a K (number of cluster inferred) ranging from 1 to 15 and with 10 repeats for each K.
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--extract-FORMAT-info GT \
--out ${POPSTR}$VCFNAME
cat ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.GT.FORMAT2
cat ${POPSTR}$VCFNAME.GT.FORMAT | cut -f 1,2 \
> ${POPSTR}$VCFNAME.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.GT.FORMAT2 | gsed "s/\t/\n/g" \
> ${POPSTR}$VCFNAME.ind
gsed "s/\t//g" ${POPSTR}$VCFNAME.GT.FORMAT2 | tail -n +2 \
> ${POPSTR}$VCFNAME.geno
datamash transpose < ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.GT.FORMAT2 | sed 's/^/>/' | sed 's/\t/\n/' | sed 's/\t//g' | cut -c -150 > ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.even_sampling.fasta
#project = snmf(paste0(PopStr_dir, vcf_name, ".geno"), K=1:15, entropy = TRUE,
# repetitions = 10, project = "new", ploidy = 1)
# Reading the results from the snmf runs
# ______________________________________
#Sample names
indv_snmf = read_tsv(paste0(PopStr_dir, vcf_name, ".ind"), col_names = F)
names(indv_snmf) = "Sample"
#Load project
project = load.snmfProject(paste0(PopStr_dir, vcf_name, ".snmfProject"))
K_list = c(1:15)
#Extracting the clustering info for the best run per K
datalist = list()
for (i in K_list){
best = which.min(cross.entropy(project, K = i))
temp = as.data.frame(Q(project, i, best))
temp= cbind(indv_snmf, temp)
temp = temp %>%
gather("Cluster", "Admix_coef", -"Sample") %>%
mutate(K=i)
datalist[[i]] = as.tibble(temp)
}
snmf_results_per_K = bind_rows(datalist) %>%
inner_join(., Zt_meta, by = c("Sample" = "ID_file")) %>%
unite(Continent, Country, col = "for_display", remove = F)
#sNMF pretty plots
# _______________
afiles = character(length(K_list))
for (i in K_list){
best = which.min(cross.entropy(project, K = i))
afiles[i] = Sys.glob(paste0(PopStr_dir, vcf_name, ".snmf/K",i, "/run", best, "/*Q"))
}
# create a qlist
qlist <- readQBasic(afiles)
al_qlist = alignK(qlist)
lab_set = inner_join(indv_snmf, Zt_meta, by = c("Sample" = "ID_file")) %>%
dplyr::select(Continent, Country) %>%
mutate(Continent = ifelse(is.na(Continent), "Unknown", Continent),
Country = ifelse(is.na(Country), "Unknown", Country))
#Low numbers
from = 2
up_to = 6
p1 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p1$plot[[1]])
#Medium numbers
from = 7
up_to = 11
p2 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p2$plot[[1]])
#High numbers
from = 12
up_to = 15
p3 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2,
clustercol = K_colors)
grid.arrange(p3$plot[[1]])
The results from the PCA and from the clutering analysis are coherent: Oceania separates into 3 clusters (one in New_Zealand, and two in Australia) and the North American isolates form two separate clusters. Higher K values also distinguish a Middle-Eastern/African cluster from the European cluster, representing the two extreme points of the gradient found between these populations in the PCA. Despite the high number of isolates from Europe, it’s interesting to see that no clustering appears there.
I need to identify the isolates which belong to each of the clusters. For this, we need to set a threshold, since there are very few (or even no) isolates assigned fully to one only. I test two thresholds: isolates assigned to one cluster with a value higher than 0.75 and 0.9.
#Setting thresholds to compare
chosen_threshold = 0.75
chosen_threshold2 = 0.9
pure_by_threshold = bind_rows(snmf_results_per_K %>%
filter(Admix_coef > chosen_threshold) %>%
mutate(Threshold = paste0("> ", chosen_threshold)),
snmf_results_per_K %>%
filter(Admix_coef > chosen_threshold2) %>%
mutate(Threshold = paste0("> ", chosen_threshold2)))
# Number of pure isolates per K
pure_by_threshold %>%
dplyr::group_by(K, Threshold) %>%
dplyr::count() %>%
ggplot(aes(x = as.factor(K), y = n, fill = Threshold)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() + scale_fill_manual(values = c("#16697a", "#82c0cc"))+
labs(x = "K", y = "Number of isolates", title = "Pure isolates per K")
# Number of pure isolates per K per country
temp2 = pure_by_threshold %>%
dplyr::group_by(Continent, for_display, Cluster, K, Threshold) %>%
dplyr::count()
temp2 %>%
filter(K > 9 & K < 16) %>%
ggplot(aes(x = Cluster, y = for_display,
size = n, color = Continent)) +
geom_point(alpha = 0.3) + Color_Continent+ theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
facet_grid(rows = vars(Threshold), cols = vars(K), scales = "free_x") +
labs(x = "", y = "", size = "Nb of isolates",
title = "Pure isolates per country and per K")
Choosing the number K of clusters that best represent the data at hand is always a challenge. First, let’s look at the cross-validation results. sNMF estimates an entropy criterion which evaluates the quality of fit of the model to the data, potentially helping to find the best number of ancestral populations.
#Dot plot of the cross-entropy
#plot(project, col = "goldenrod", pch = 19, cex = 1.2) #native method
cross_ent = list()
for (i in K_list){
cross_ent[[i]] = as_tibble(cross.entropy(project, K = i), rownames = "NA") %>%
mutate( K = i)
colnames(cross_ent[[i]] ) <- c("Run", "Crossentropy", "K")
}
cross_ent = bind_rows(cross_ent)
temp = cross_ent %>% group_by(K) %>% dplyr::summarize(average = mean(Crossentropy), minimum = min(Crossentropy))
p1 = ggplot() +
geom_point(data = cross_ent, aes(x = as.factor(K), y = Crossentropy), alpha = 0.4, size = 2, col = "#82c0cc") +
geom_point(data = temp, aes(x = as.factor(K), y = minimum), alpha = 0.4, size = 2, col = "#16697a") +
theme_cowplot() +
labs(x = "K", y = "Cross-entropy")
p2 = pure_by_threshold %>%
filter(Threshold == paste0("> ", chosen_threshold)) %>%
filter(K > 1) %>%
dplyr::group_by(Cluster, K) %>%
dplyr::count()%>%
group_by(K) %>%
dplyr::summarize(Size_smallest_cluster = min(n)) %>%
ggplot(aes(x = as.factor(K), label = Size_smallest_cluster)) +
geom_bar(aes(y = Size_smallest_cluster, fill = K), stat = "identity") +
theme_cowplot() +
geom_label(aes(y = Size_smallest_cluster + 7)) +
scale_fill_continuous(high = "#16697a", low = "#82c0cc") +
labs(x = "K", y = "Size of the smallest cluster")
p3 = pure_by_threshold %>%
filter(Threshold == paste0("> ", chosen_threshold)) %>%
dplyr::group_by(K) %>%
dplyr::count() %>%
ggplot(aes(x = as.factor(K), y = n, fill = K)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
scale_fill_continuous(high = "#16697a", low = "#82c0cc") +
labs(x = "K", y = "Number of assigned samples")
top_row = cowplot::plot_grid(p1, p3 + theme(legend.position = "none"), ncol = 2)
cowplot::plot_grid(top_row, p2 + theme(legend.position = "none"), ncol = 1)
In the case of our analysis, we do not have a very clear-cut minimum value for the cross-entropy criterion value. There is however a plateau starting from K=10. I also look at the number of samples assigned to one cluster or another based on the threshold set previously, as well as at the smallest cluster size. Even though, we might get hints that a very real cluster exists in one specific population, if the cluster size is very small, I will not be able to estimate anything with them so there is no point in going to the level of mini-clusters!
#These values are chosen based on the plot above
chosen_K = 11
write_tsv(snmf_results_per_K %>% filter(K == chosen_K),
file = paste0(PopStr_dir, vcf_name, ".snmf_results_chosen_K.tab"))
#Table
kable(snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Admix_coef > chosen_threshold) %>%
dplyr::group_by(Continent, Cluster) %>%
dplyr::count() %>%
pivot_wider(names_from = Continent, values_from =n, values_fill = 0))
| Cluster | Africa | Europe | Middle East | North America | Oceania | South America | NA |
|---|---|---|---|---|---|---|---|
| V11 | 21 | 8 | 2 | 0 | 0 | 0 | 0 |
| V2 | 4 | 553 | 0 | 0 | 0 | 0 | 2 |
| V6 | 0 | 0 | 40 | 0 | 0 | 0 | 0 |
| V7 | 0 | 0 | 16 | 0 | 0 | 0 | 0 |
| V10 | 0 | 0 | 0 | 178 | 0 | 0 | 1 |
| V4 | 0 | 0 | 0 | 1 | 36 | 0 | 0 |
| V8 | 0 | 0 | 0 | 20 | 0 | 0 | 0 |
| V1 | 0 | 0 | 0 | 0 | 43 | 0 | 0 |
| V3 | 0 | 0 | 0 | 0 | 31 | 0 | 0 |
| V5 | 0 | 0 | 0 | 0 | 0 | 31 | 0 |
| V9 | 0 | 0 | 0 | 0 | 0 | 16 | 0 |
#Looking at individuals with admixture coef higher than the threshold defined above.
high_anc_coef_snmf = snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Admix_coef > chosen_threshold)
##Bubble plot
p_cluster = high_anc_coef_snmf %>%
dplyr::group_by(Continent, for_display, Cluster) %>%
dplyr::count() %>%
ggplot(aes(x = for_display, y = Cluster,
size = n, color = Continent)) +
geom_point(alpha = 0.5) + Color_Continent+ theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(x = "",
title = "Pure isolates per country for the chosen K value",
subtitle = str_wrap(paste0("Number of genotypes with admix coef > ", chosen_threshold,
" assigned to ", chosen_K, " clusters."),
width = 70),
size = "Nb of isolates")
p_cluster
##Writing out tables for later
high_anc_coef_snmf %>% dplyr::select(Sample) %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.ind"),
col_names = F)
high_anc_coef_snmf %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv"),
col_names = T)
max_continent = dplyr::count(high_anc_coef_snmf, Cluster, Continent) %>%
group_by(Cluster) %>%
mutate(somme = sum(n), prop = n/somme) %>%
filter(prop > 0.5) %>%
dplyr::select(Cluster, Continent)
clusters = dplyr::count(high_anc_coef_snmf, Cluster, Continent)
high_anc_coef_snmf %>%
dplyr::count(Cluster, Continent) %>%
group_by(Cluster) %>%
dplyr::mutate(Nb_per_cluster = sum(n)) %>%
filter(!is.na(Continent)) %>%
ggplot(aes(x = reorder(Cluster, Nb_per_cluster), y = n, fill = Continent)) +
geom_bar(stat = "identity") +
theme_bw() +
labs(x = element_blank(), y = "Number of samples") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1)) +
Fill_Continent
#Files with all pure isolates per cluster
for (cluster in max_continent %>% pull(Cluster)) {
file_name = paste0(PopStr_dir, "Sample_list_", cluster, ".args")
high_anc_coef_snmf %>% filter(Cluster == cluster) %>%
dplyr::select(Sample) %>%
write_tsv(file = file_name, col_names = F)
}
#Files with a similar number pure isolates between cluster
minimum_cluster_size = min(dplyr::count(high_anc_coef_snmf, Cluster) %>% dplyr::select(n))
for (cluster in max_continent %>% pull(Cluster)) {
for (i in 1:10 ){
file_name = paste0(PopStr_dir, "Sample_list_", cluster, "_", i, ".args")
temp = high_anc_coef_snmf %>%
filter(Cluster == cluster) %>%
sample_n(size = minimum_cluster_size) %>%
dplyr::select(Sample)
write_tsv(temp, file = file_name, col_names = F)
}
}
The clustering data can also be represented as an average of ancestry coefficient per country. This is done here first with barplots.
chosen_coef = snmf_results_per_K %>%
filter(K == chosen_K) %>%
filter(Country != "NA") %>%
filter(Country != "USA_NA") %>%
dplyr::select(Sample, Continent, Country, Admix_coef, Cluster, Latitude, Longitude)
#Identifying the main cluster per country
cluster_per_country = high_anc_coef_snmf %>%
group_by(Country, Cluster) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = n / sum(n)) %>%
filter(freq > 0.5) %>%
dplyr::select(Country, Main_country_cluster = Cluster)
# Cluster composition per country for all continents
temp = chosen_coef %>%
group_by(Continent, Country, Cluster) %>%
dplyr::summarize(average_coef = mean(Admix_coef),
Nb_per_country = n()) %>%
dplyr::mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster)) %>%
filter(Nb_per_country >= 6)
#Bar plot per country
temp %>% filter(Continent != "NA") %>%
filter(Continent != "Asia") %>%
ggplot(aes(x = Country, y = average_coef, fill = Cluster2)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("grey", K_colors) ) +
theme_cowplot() +
labs(subtitle = "All continents, limited to countries with 6 or more isolates",
title = "Cluster ancestry per country",
y = "Average of the ancestry coefficient", fill = "Cluster") +
facet_grid(cols = vars(Continent), switch = "x", scales = "free_x", space = "free_x") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(angle = 50, hjust = 1, vjust = 1, size = 8),
axis.text.y = element_text(hjust = 1, vjust = 1, size = 10),
panel.spacing = unit(0, "lines"),
strip.background = element_blank(),
strip.placement = "bottom",
strip.text = element_text(size = 10),
legend.position = "bottom")
Such visualization is useful but not as intuitive as a map! Let’s use the same summarising method but with a more local viewpoint (I use rounded coordinates instead of country).
#Summarizing based on samples found in neighbouring areas
temp = chosen_coef %>%
mutate(Latitude = round(Latitude/2)*2, Longitude = round(Longitude/2)*2)%>%
group_by(Continent, Country, Cluster, Longitude, Latitude) %>%
summarize(average_coef = mean(Admix_coef), number_of_isolates = n()) %>%
filter(!is.na(Latitude))
write_tsv(temp, file = paste0(PopStr_dir, vcf_name, ".chosen_coef_pies.tab"))
#Transforming to fit the scatter pie requirements
pies = temp %>% #filter(number_of_isolates > 2) %>%
mutate(Cluster2 = ifelse(average_coef < 0.1, "Other", Cluster)) %>%
group_by(Continent, Longitude, Latitude, Country, Cluster2, number_of_isolates) %>%
dplyr::summarize(to_plot = sum(average_coef)) %>%
arrange(Cluster2) %>%
pivot_wider(names_from = Cluster2, values_from = to_plot, values_fill = 0) %>%
mutate(radius = ifelse(number_of_isolates > 10, 1.5, log(number_of_isolates))) %>%
dplyr::select(-number_of_isolates)
# Simple map of the world
K_colors = c("#0D6E82", #V1 Aus (TAS)
"#49810E", #V10 USA
"#E16684", #V11 North Africa
"#FFBA0A", #V2 Europe
"#C7F1F9", #V3 NZ
"#21C7E8", #V4 Australia (NSW)
"#8FA253", #V5 Uruguay + Argentina
"#650104", #V6 Israel + Turkey
"#DE020A", #V7 Iran
"#2A4908", #V8 Canada
"#B3C186") #V9 Boliva + Chile + Ecuador
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
scale_size("Number of genomes", limits = c(1, max_circle)) +
theme(legend.position = "None",
panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1)) +
scale_fill_manual(values = c(c("grey"), K_colors))
#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
p3 = map1 + coord_cartesian(xlim=c(115, 185), ylim=c(-65, 10)) +
geom_scatterpie(data = pies, mapping = aes(x=Longitude, y=Latitude, group=Country, r = radius*2),
cols = c("Other", "V1", "V10", "V11", "V2", "V3", "V4",
"V5", "V6", "V7", "V8", "V9"), color=NA, alpha=.8)
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3, get_legend(p1), ncol = 2, rel_widths = c(1, 0.75))
ligne = cowplot::plot_grid(p1, aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2, ligne, ncol = 2, rel_widths = c(0.75, 1))
#ggsave(paste0(fig_dir, "Str_scatter_pie.pdf"), width = 18, height = 10, units = "cm")
For each country, we define the local cluster with “votes”: the cluster to which more than half of the non-admixed isolates are originating is considered to be the local cluster. We can quantify for each country the isolates which are non-admixed (or “pure”) isolates from the local cluster, non-admixed from another cluster, or hybrid.
#For each sample, identify if is hybrid, local pure or pure from somewhere else.
status_admix = inner_join(chosen_coef, cluster_per_country) %>%
group_by(Sample, Continent, Country) %>%
dplyr::mutate(max_admix = max(Admix_coef),
max_cluster = ifelse(Admix_coef == max_admix, Cluster, "")) %>%
filter(max_cluster != "") %>%
dplyr::mutate(Status = ifelse(max_admix < chosen_threshold, "Hybrid",
ifelse(max_cluster == Main_country_cluster, "Pure local",
"Pure other")))
# Summary of hybrid category per country
temp = status_admix %>%
group_by(Continent, Country, Main_country_cluster) %>%
dplyr::count(Status)
## As table
kable(pivot_wider(temp, names_from = Status, values_from = n, values_fill = 0))
| Continent | Country | Main_country_cluster | Hybrid | Pure local | Pure other |
|---|---|---|---|---|---|
| Africa | Algeria | V11 | 3 | 5 | 0 |
| Africa | Kenya | V2 | 4 | 2 | 0 |
| Africa | Morocco | V11 | 0 | 2 | 0 |
| Africa | Tunisia | V11 | 1 | 14 | 2 |
| Europe | Belgium | V2 | 0 | 14 | 0 |
| Europe | Czech Republic | V2 | 2 | 18 | 0 |
| Europe | Denmark | V2 | 0 | 9 | 0 |
| Europe | France | V2 | 6 | 228 | 4 |
| Europe | Germany | V2 | 1 | 63 | 0 |
| Europe | Ireland | V2 | 0 | 36 | 0 |
| Europe | Italy | V11 | 1 | 2 | 0 |
| Europe | Latvia | V2 | 0 | 2 | 0 |
| Europe | Netherlands | V2 | 2 | 6 | 0 |
| Europe | Poland | V2 | 0 | 5 | 0 |
| Europe | Spain | V11 | 4 | 2 | 0 |
| Europe | Sweden | V2 | 0 | 6 | 0 |
| Europe | Switzerland | V2 | 2 | 135 | 0 |
| Europe | UK | V2 | 0 | 31 | 0 |
| Middle East | Iran | V7 | 6 | 16 | 0 |
| Middle East | Israel | V6 | 0 | 39 | 0 |
| Middle East | Syria | V11 | 6 | 1 | 0 |
| Middle East | Turkey | V11 | 10 | 1 | 0 |
| Middle East | Yemen | V6 | 0 | 1 | 0 |
| North America | Canada | V8 | 0 | 11 | 0 |
| North America | USA_California | V10 | 0 | 7 | 0 |
| North America | USA_Indiana | V10 | 4 | 7 | 0 |
| North America | USA_Minnesota | V8 | 0 | 2 | 0 |
| North America | USA_Missouri | V10 | 3 | 7 | 0 |
| North America | USA_North Dakota | V8 | 0 | 7 | 0 |
| North America | USA_Ohio | V10 | 0 | 3 | 0 |
| North America | USA_Oregon | V10 | 0 | 145 | 0 |
| North America | USA_Texas | V10 | 1 | 7 | 0 |
| Oceania | Australia_NA | V4 | 1 | 9 | 3 |
| Oceania | Australia_NSW | V4 | 0 | 27 | 0 |
| Oceania | Australia_Tasmania | V1 | 0 | 40 | 0 |
| Oceania | New Zealand | V3 | 24 | 31 | 0 |
| South America | Argentina | V5 | 0 | 16 | 0 |
| South America | Bolivia | V9 | 0 | 1 | 0 |
| South America | Chile | V9 | 0 | 14 | 0 |
| South America | Ecuador | V9 | 1 | 1 | 0 |
| South America | Uruguay | V5 | 1 | 15 | 0 |
## As a figure
ungroup(temp) %>%
dplyr::mutate(Nb_per_country = sum(n)) %>%
filter(Nb_per_country >= 10) %>%
ggplot(aes(x = Country, y = n, fill = Status)) +
geom_bar(stat = "identity", position = "fill") +
theme_cowplot() +
scale_fill_manual(values =c("#16697a", "#EDE7E3", "#82c0cc")) +
labs(y = "Proportion of isolates")+
facet_grid(rows = vars(Continent), switch = "y", scales = "free_y", space = "free_y") +
theme(axis.title.y = element_blank(),
axis.title.x = element_text(size = 14),
axis.text.x = element_text(size =10),
axis.text.y = element_text(hjust = 1, vjust = 1, size = 8),
panel.spacing = unit(0, "lines"),
strip.background = element_blank(),
strip.placement = "bottom",
strip.text = element_text(size = 10),
legend.position = "top") +
coord_flip()
#facet_wrap(vars(Continent), scales = "free")
#Table hybrids assigned to several clusters, and pure foreign
temp = inner_join(chosen_coef,
status_admix %>%
filter(Status != "Pure local") %>%
dplyr::select(Sample, Status)) %>%
dplyr::select(Sample, Continent, Country, Latitude, Longitude, Status, Cluster, Admix_coef) %>%
mutate(Admix_coef = ifelse(Admix_coef > 0.2, round(Admix_coef, 4), NA)) %>%
pivot_wider(names_from = Cluster, values_from = Admix_coef) %>%
rowwise() %>%
dplyr::mutate(Count_NA = sum(is.na(c_across(V1:V11)))) %>%
filter(Status == "Pure other" | Count_NA < 10)
opts <- options(knitr.kable.NA = "")
kable(temp, digits = 3, )
| Sample | Continent | Country | Latitude | Longitude | Status | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | Count_NA |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Indiana_1993_I25a_1 | North America | USA_Indiana | 40.420 | -86.920 | Hybrid | 0.224 | 0.706 | 9 | |||||||||
| Missouri_1994_ST94MO15a_1 | North America | USA_Missouri | 38.890 | -92.210 | Hybrid | 0.221 | 0.698 | 9 | |||||||||
| ST08TN_26_5_4 | Africa | Tunisia | 36.810 | 10.179 | Pure other | 0.805 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur10 | Europe | France | 44.218 | 0.448 | Hybrid | 0.368 | 0.608 | 9 | |||||||||
| ST13FR_Biog_SeptoDur12 | Europe | France | 44.218 | 0.448 | Hybrid | 0.449 | 0.548 | 9 | |||||||||
| ST13FR_Biog_SeptoDur29 | Europe | France | 44.308 | 4.346 | Pure other | 0.765 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur8 | Europe | France | 44.218 | 0.448 | Hybrid | 0.389 | 0.540 | 9 | |||||||||
| ST13FR_Biog_SeptoDur82 | Europe | France | 44.308 | 4.346 | Pure other | 0.834 | 10 | ||||||||||
| ST13FR_Biog_SeptoDur88 | Europe | France | 44.308 | 4.346 | Pure other | 0.880 | 10 | ||||||||||
| ST13IT_Biog_1819 | Europe | Italy | 41.891 | 12.492 | Hybrid | 0.542 | 0.245 | 9 | |||||||||
| ST13NZ_St13_5_4 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.246 | 0.488 | 9 | |||||||||
| ST15NZ_St_15640_1 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.270 | 0.562 | 9 | |||||||||
| ST15NZ_St_15640_2 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.263 | 0.583 | 9 | |||||||||
| ST15NZ_St_15640_7 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.300 | 0.525 | 9 | |||||||||
| ST15NZ_St_15640_8 | Oceania | New Zealand | -40.134 | 175.658 | Hybrid | 0.257 | 0.570 | 9 | |||||||||
| ST15NZ_St_15642_11 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.562 | 0.204 | 9 | |||||||||
| ST15NZ_St_15642_12 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.213 | 0.580 | 9 | |||||||||
| ST15NZ_St_15642_13 | Oceania | New Zealand | -46.219 | 169.492 | Hybrid | 0.220 | 0.593 | 9 | |||||||||
| ST_SRR2834990 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.967 | 10 | ||||||||||
| ST_SRR2835000 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.978 | 10 | ||||||||||
| ST_SRR2835057 | Oceania | Australia_NA | -35.296 | 149.122 | Pure other | 0.915 | 10 | ||||||||||
| STnnJGI_SRR3452689 | Middle East | Turkey | 39.931 | 32.833 | Hybrid | 0.252 | 0.622 | 9 | |||||||||
| STnnJGI_SRR3452700 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.342 | 0.304 | 9 | |||||||||
| STnnJGI_SRR3452702 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.259 | 0.564 | 9 | |||||||||
| STnnJGI_SRR3452704 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.338 | 0.302 | 9 | |||||||||
| STnnJGI_SRR3452713 | Europe | France | 48.853 | 2.347 | Pure other | 0.885 | 10 | ||||||||||
| STnnJGI_SRR3452770 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.319 | 0.635 | 9 | |||||||||
| STnnJGI_SRR3452776 | Africa | Tunisia | 36.810 | 10.179 | Pure other | 0.844 | 10 | ||||||||||
| STnnJGI_SRR4235083 | Africa | Algeria | 36.818 | 3.048 | Hybrid | 0.352 | 0.236 | 9 | |||||||||
| STnnJGI_SRR4235084 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.405 | 0.292 | 0.249 | 8 | ||||||||
| STnnJGI_SRR4235085 | Africa | Tunisia | 36.810 | 10.179 | Hybrid | 0.401 | 0.236 | 9 | |||||||||
| STnnJGI_SRR4235089 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.406 | 0.234 | 0.265 | 8 | ||||||||
| STnnJGI_SRR5194479 | Europe | Spain | 40.416 | -3.708 | Hybrid | 0.322 | 0.218 | 0.216 | 8 | ||||||||
| STnnJGI_SRR5194515 | Europe | Spain | 40.416 | -3.708 | Hybrid | 0.265 | 0.201 | 0.246 | 8 | ||||||||
| STnnJGI_SRR5194596 | Middle East | Iran | 35.727 | 51.385 | Hybrid | 0.222 | 0.384 | 9 | |||||||||
| STnnJGI_SRR5194607 | Middle East | Syria | 33.500 | 35.800 | Hybrid | 0.277 | 0.619 | 9 | |||||||||
| STnnJGI_SRR5829674 | Oceania | New Zealand | -41.211 | 174.777 | Hybrid | 0.440 | 0.249 | 0.264 | 8 | ||||||||
| Syria_1992_SYK2 | Middle East | Syria | 33.517 | 36.316 | Hybrid | 0.285 | 0.583 | 9 | |||||||||
| Syria_1992_SYT3 | Middle East | Syria | 36.010 | 36.940 | Hybrid | 0.239 | 0.596 | 9 |
write_tsv(x = temp, file = paste0(PopStr_dir, "Hybrids_and_pure_foreign.tab"))
# Simple map of the world
map1 = ggplot() +
theme_void() +
geom_polygon(data = map_data("world"), aes(x=long, y = lat, group = group), fill="#ede7e3") +
theme(panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))
#Splitting the map into our 3 main focus areas
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1)
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80))+
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 1, height = 1)
p3 = map1 + coord_cartesian(xlim=c(115, 175), ylim=c(-65, 10)) +
geom_jitter(data = temp, aes(Longitude, Latitude, col = Status), width = 2, height = 2)
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "none"), get_legend(p1), ncol = 2, rel_widths = c(1, 0.9))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "none"), aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "none"), ligne, ncol = 2, rel_widths = c(0.7, 1))
#ggsave(paste0(fig_dir, "Str_scatter_pie.pdf"), width = 18, height = 10, units = "cm")
As a first method to investigate the population structure of Z.tritici at the world-wide scale, I chose to do a principal component analysis based on a subset of the SNPs. This PCA confirms previous results of a geographically structured species. Oceania emerges quite distincly as three separate clusters: one in New-Zealand and two Australian (see below for a more in-depth analysis of this pattern). North-America is also quite serapate. The distinction between the rest of the geographical location is not clear in this PCA, although PC3 might show a slight differenciation between Europe and the Middle-East.
snpgdsVCF2GDS(paste0(vcf_dir, vcf_name, ".recode.vcf"),
paste0(PopStr_dir, vcf_name, ".recode.gds"), method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".recode.gds"))
pca <-snpgdsPCA(genofile)
snpgdsClose(genofile)
pca2 = as_tibble(pca$eigenvect) %>% dplyr::select(V1:V8)
colnames(pca2) = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8")
pca2 = pca2 %>%
dplyr::mutate(sample_id = pca$sample.id ) %>%
dplyr::right_join(., status_admix, by = c("sample_id" = "Sample")) %>%
filter(!is.na(PC1))
#Writing table to store PCA results
pca2 %>%
dplyr::select(ID_file = sample_id, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8,
Continent, Country, Latitude, Longitude) %>%
write_tsv(., file = paste0(PopStr_dir, vcf_name, ".PCA_results.tab"),
col_names = T)
#
as.tibble(pca$eigenval[!is.na(pca$eigenval)]) %>%
ggplot(aes(x = c(1:length(pca$eigenval[!is.na(pca$eigenval)])),
y =value)) +
geom_point() +
theme_bw()
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
for_plot = pca2 %>% mutate(Cluster = ifelse(Status == "Hybrid", Status, Cluster))
#Defining colors per cluster based on cluster
myColors2 <- c("grey", "#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba",
"#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
myShapes <- c(1, 1, 1, 1, 1, 2, 0,
1, 1, 0, 0, 0)
names(myColors2) = levels(factor(for_plot$Cluster))
names(myShapes) = levels(factor(for_plot$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors2)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors2)
Shape_Cluster = ggplot2::scale_shape_manual(name = "Cluster", values = myShapes)
# Dot plots
p1 = ggplot(for_plot, aes(x = PC1, y= PC2, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p2 = ggplot(for_plot, aes(x = PC3, y= PC4, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)"),
y = paste0("PC 4 (", round(pca$eigenval[4]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p3 = ggplot(for_plot, aes(x = PC5, y= PC6, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 5 (", round(pca$eigenval[5]*100/eigen_sum, 2), "%)"),
y = paste0("PC 6 (", round(pca$eigenval[6]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
p4 = ggplot(for_plot, aes(x = PC7, y= PC8, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 7 (", round(pca$eigenval[7]*100/eigen_sum, 2), "%)"),
y = paste0("PC 8 (", round(pca$eigenval[8]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw()
PCA_grid = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"),
p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"))
cowplot::plot_grid(PCA_grid, get_legend(p1 + theme(legend.position = "bottom")), ncol =1, rel_heights = c(1, 0.2))
p11 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC1, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
p12 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC2, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
p13 = filter(for_plot, Cluster != "Hybrid") %>%
ggplot(aes(PC3, color = Cluster, fill = Cluster)) +
geom_density(alpha = .3)+
Color_Cluster + Fill_Cluster +
theme_cowplot() +
theme(axis.title = element_blank(), axis.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"), axis.ticks = element_blank())
cowplot::plot_grid(p11+ theme(legend.position = "none"),
p12 + theme(legend.position = "none"),
p13 + theme(legend.position = "none"),
ncol = 1, align = "hv")
ggMarginal(ggplot(for_plot, aes(x = PC2, y= PC3, shape = Cluster)) +
geom_point(aes(color = Cluster)) +
labs(x = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)"),
y = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)")) +
Color_Cluster + Shape_Cluster +
theme_bw() + theme(legend.position = "bottom"), groupColour = TRUE, groupFill = TRUE)
ggplot2::theme_set(theme_light())
p = ggpairs(pca2, columns = c(1:6),
ggplot2::aes(col=Continent, fill = Continent, alpha = 0.6),
title = "PCA based thinned SNPs",
upper = list(continuous = "points", combo = "box_no_facet"))
for(i in 1:p$nrow) {
for(j in 1:p$ncol){
p[i,j] <- p[i,j] + theme_light() + Color_Continent + Fill_Continent
}
}
p
In the steps before, I have learned about population history indirectly by inferring genetic populations from the genomic data. The relationship between the population and the underlying demography is not explicit in these however. It is possible however to infer splits between populations and create a population tree. Here, I use treemix, which takes into account the possibility of gene flow between populations and indeed test of it in the process of creating a population tree.
Because the populations in the clustering were not perfectly distinct from one another, I start with “discretized” populations by choosing only the isolates with high ancestry in one of the sNMF clusters.
~/Documents/Software/vcftools_jydu/src/cpp/vcftools \
--vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}$VCFNAME.high_anc_coef_snmf.ind \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--out ${POPSTR}$VCFNAME.high_anc_coef_snmf
cat ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.high_anc_coef_snmf.GT.FORMAT2
#conda activate r-reticulate
from collections import defaultdict
#For each isolate, store its pop (as in sampling site) in a dictionary
dict_pop = dict(zip(r.high_anc_coef_snmf["Sample"],
r.high_anc_coef_snmf["Cluster"]))
#Keep a list of the pop names/coordinates to write in the same order later
all_pops = sorted(list(set(r.high_anc_coef_snmf["Cluster"])))
#out_name = r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.treemix"
out_name = "/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix"
out = open(out_name, "w")
shutup = out.write(" ".join(all_pops) + "\n")
#with open(r.PopStr_dir + r.vcf_name + ".high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps :
with open("/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps :
for i, snp in enumerate(input_snps) :
#Setting two dictionaries with values at 0
dict_snp0 = defaultdict(int)
dict_snp1 = defaultdict(int)
Lets_write = True
#The first line is the name of the isolates
if i == 0 :
indv = snp.strip().split("\t")
Lets_write = False
else :
#Keeping isolate name and allelic value together
alleles = zip(indv, snp.strip().split("\t"))
#...and counting the O and 1 based on the pop
for ind, allele in alleles:
if allele == "0" :
dict_snp0[dict_pop[ind]] += 1
elif allele == "1" :
dict_snp1[dict_pop[ind]] += 1
else :
print("Only biallelic please!!!!")
Lets_write = False
#If I have not found anything weird, I will write the result to the output file.
if Lets_write :
shutup = out.write(" ".join([",".join([str(dict_snp0[pop]), str(dict_snp1[pop])]) for pop in all_pops]) + "\n")
print("All done!")
out.close()
POPSTR="/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/"
if [ -f ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix ] ;
then
gzip ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix
fi
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m0.treemix.out \
-m 0 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m1.treemix.out \
-m 1 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m2.treemix.out \
-m 2 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m3.treemix.out \
-m 3 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m4.treemix.out \
-m 4 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m5.treemix.out \
-m 5 -root V7
treemix \
-i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.high_anc_coef_snmf.m6.treemix.out \
-m 6 -root V7
source("~/Documents/Software/treemix-1.13/src/plotting_funcs.R")
p_cluster
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m0.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 75 1 211 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 3 1 1 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 172 4 0 2
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 104 2 15 1
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 4 1 136 9
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 2 2 16 6
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 32 3 171 1
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 51 1 52 10
## V11
## 1 (V8:0.0462969,V10:0.0279962):0.0095659
## 2 V5:0.0114564
## 3 (V9:0.031812,V5:0.0114564):0.00720251
## 4 V9:0.031812
## 5 V6:0.00531978
## 6 V3:0.031223
## 7 ((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794
## 8 V1:0.045482
## 9 ((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577
## 10 V7:0.0137477
## 11 (V6:0.00531978,(V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456):0.0137477
## 12 V8:0.0462969
## 13 ((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261
## 14 V4:0.0536008
## 15 (V1:0.045482,V4:0.0536008):0.0126766
## 16 V11:0.0239367
## 17 (V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456
## 18 V2:0.000271025
## 19 (((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535
## 20 V10:0.0279962
## 21 (V7:0.0137477,(V6:0.00531978,(V11:0.0239367,((V9:0.031812,V5:0.0114564):0.00720251,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.004535,(V8:0.0462969,V10:0.0279962):0.0095659):0.0037794):0.018261):0.0118456):0.0137477);
## x y ymin ymax
## 1 0.05719960 0.09090909 0.00000000 0.18181818
## 2 0.06251321 0.59090909 0.54545455 0.63636364
## 3 0.05105681 0.63636364 0.54545455 0.72727273
## 4 0.08286881 0.68181818 0.63636364 0.72727273
## 5 0.01906748 0.86363636 0.81818182 0.90909091
## 6 0.09325747 0.31818182 0.27272727 0.36363636
## 7 0.04763370 0.18181818 0.00000000 0.54545455
## 8 0.12019307 0.50000000 0.45454545 0.54545455
## 9 0.06203447 0.36363636 0.27272727 0.54545455
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.81818182 0.00000000 0.90909091
## 12 0.10349650 0.13636364 0.09090909 0.18181818
## 13 0.04385430 0.54545455 0.00000000 0.72727273
## 14 0.12831187 0.40909091 0.36363636 0.45454545
## 15 0.07471107 0.45454545 0.36363636 0.54545455
## 16 0.04953000 0.77272727 0.72727273 0.81818182
## 17 0.02559330 0.72727273 0.00000000 0.81818182
## 18 0.05243972 0.22727273 0.18181818 0.27272727
## 19 0.05216870 0.27272727 0.18181818 0.54545455
## 20 0.08519580 0.04545455 0.00000000 0.09090909
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## [1] 0.06251321 0.08286881 0.01906748 0.09325747 0.12019307 0.01374770
## [7] 0.10349650 0.12831187 0.04953000 0.05243972 0.08519580
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m1.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 3 1 1 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 172 4 0 2
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 104 2 15 1
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 4 1 136 9
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 2 2 16 6
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 76 8 135 1
## 18 171 V2 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 32 3 171 1
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 51 1 52 10
## 22 216 <NA> NOT_ROOT MIG NOT_TIP 52 136 NA NA NA
## V11
## 1 (V10:0.0156642,V8:0.132024):0.0214575
## 2 V5:0.0114564
## 3 (V9:0.031812,V5:0.0114564):0.00582432
## 4 V9:0.031812
## 5 V6:0.00531978
## 6 V3:0.031223
## 7 ((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961
## 8 V1:0.045482
## 9 ((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577
## 10 V7:0.0137477
## 11 (V6:0.00531978,(((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322):0.0137477
## 12 V8:0.132024
## 13 ((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059
## 14 V4:0.0536008
## 15 (V1:0.045482,V4:0.0536008):0.0126766
## 16 V11:0.0237574
## 17 (((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322
## 18 V2:0.000271025
## 19 (((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075
## 20 V10:0.0156642
## 21 (V7:0.0137477,(V6:0.00531978,(((V9:0.031812,V5:0.0114564):0.00582432,((((V1:0.045482,V4:0.0536008):0.0126766,V3:0.031223):0.00986577,V2:0.000271025):0.000670075,(V10:0.0156642,V8:0.132024):0.0214575):0.00740961):0.0196059,V11:0.0237574):0.0119322):0.0137477);
## 22 <NA>
## x y ymin ymax
## 1 0.07415296 0.18181818 0.09090909 0.27272727
## 2 0.06256657 0.68181818 0.63636364 0.72727273
## 3 0.05111017 0.72727273 0.63636364 0.81818182
## 4 0.08292217 0.77272727 0.72727273 0.81818182
## 5 0.01906748 0.86363636 0.81818182 0.90909091
## 6 0.09445431 0.40909091 0.36363636 0.45454545
## 7 0.05269546 0.27272727 0.09090909 0.63636364
## 8 0.12138990 0.59090909 0.54545455 0.63636364
## 9 0.06323131 0.45454545 0.36363636 0.63636364
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.81818182 0.00000000 0.90909091
## 12 0.12549735 0.13636364 0.09090909 0.18181818
## 13 0.04528585 0.63636364 0.09090909 0.81818182
## 14 0.12950871 0.50000000 0.45454545 0.54545455
## 15 0.07590790 0.54545455 0.45454545 0.63636364
## 16 0.04943735 0.04545455 0.00000000 0.09090909
## 17 0.02567995 0.09090909 0.00000000 0.81818182
## 18 0.05363656 0.31818182 0.27272727 0.36363636
## 19 0.05336553 0.36363636 0.27272727 0.63636364
## 20 0.08981716 0.22727273 0.18181818 0.27272727
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.02273023 NA 0.00000000 0.81818182
## [1] "0.752794 0.0137477 0.02567995"
## [1] 0.06256657 0.08292217 0.01906748 0.09445431 0.12138990 0.01374770
## [7] 0.12549735 0.12950871 0.04943735 0.05363656 0.08981716
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m2.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 1 1 3 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 0 2 172 4
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 104 2
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 4 1 136 9
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 16 6 2 2
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 171 1 32 3
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 52 10 51 1
## 22 263 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 23 216 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## V11
## 1 (V10:0.0116286,V8:0.192552):0.024709
## 2 V5:0.00666147
## 3 (V5:0.00666147,V9:0.052367):0.0102706
## 4 V9:0.052367
## 5 V6:0.00532122
## 6 V3:0.0306167
## 7 ((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141
## 8 V1:0.0451691
## 9 (V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408
## 10 V7:0.0137485
## 11 (V6:0.00532122,(V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456):0.0137485
## 12 V8:0.192552
## 13 (((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301
## 14 V4:0.0541295
## 15 (V1:0.0451691,V4:0.0541295):0.0130093
## 16 V11:0.0239381
## 17 (V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456
## 18 V2:0
## 19 (V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191
## 20 V10:0.0116286
## 21 ((V6:0.00532122,(V11:0.0239381,(((V10:0.0116286,V8:0.192552):0.024709,(V2:0,(V3:0.0306167,(V1:0.0451691,V4:0.0541295):0.0130093):0.0105408):0.000836191):0.00956141,(V5:0.00666147,V9:0.052367):0.0102706):0.0171301):0.0118456):0.0137485,V7:0.0137485);
## 22 <NA>
## 23 <NA>
## x y ymin ymax
## 1 0.07699463 0.72727273 0.63636364 0.81818182
## 2 0.05965629 0.22727273 0.18181818 0.27272727
## 3 0.05299482 0.18181818 0.09090909 0.27272727
## 4 0.08726837 0.13636364 0.09090909 0.18181818
## 5 0.01906972 0.95454545 0.90909091 1.00000000
## 6 0.09427932 0.50000000 0.45454545 0.54545455
## 7 0.05228563 0.63636364 0.27272727 0.81818182
## 8 0.12184102 0.40909091 0.36363636 0.45454545
## 9 0.06366262 0.45454545 0.27272727 0.54545455
## 10 0.01374850 0.04545455 0.00000000 0.09090909
## 11 0.01374850 0.90909091 0.09090909 1.00000000
## 12 0.12880987 0.68181818 0.63636364 0.72727273
## 13 0.04272422 0.27272727 0.09090909 0.81818182
## 14 0.13080142 0.31818182 0.27272727 0.36363636
## 15 0.07667192 0.36363636 0.27272727 0.45454545
## 16 0.04953220 0.86363636 0.81818182 0.90909091
## 17 0.02559410 0.81818182 0.09090909 0.90909091
## 18 0.05312182 0.59090909 0.54545455 0.63636364
## 19 0.05312182 0.54545455 0.27272727 0.63636364
## 20 0.08862323 0.77272727 0.72727273 0.81818182
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.09649812 NA 0.27272727 0.36363636
## 23 0.03013852 NA 0.09090909 0.81818182
## [1] "0.366274 0.076671921 0.130801421"
## [1] "0.265289 0.0255941 0.04272422"
## [1] 0.05965629 0.08726837 0.01906972 0.09427932 0.12184102 0.01374850
## [7] 0.12880987 0.13080142 0.04953220 0.05312182 0.08862323
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m3.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 1 1 3 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 0 2 172 4
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 15 1 171 1
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 4 1 136 9
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 16 6 2 2
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 104 2 32 2
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 52 10 51 1
## 22 263 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 23 216 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## 24 289 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## V11
## 1 (V10:0.0117506,V8:0.191344):0.0245533
## 2 V5:0.0070738
## 3 (V5:0.0070738,V9:0.0489904):0.00987258
## 4 V9:0.0489904
## 5 V6:0.00532132
## 6 V3:0.0684707
## 7 ((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842
## 8 V1:0.0440156
## 9 (V3:0.0684707,V2:0):0
## 10 V7:0.0137485
## 11 (V6:0.00532132,(V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456):0.0137485
## 12 V8:0.191344
## 13 (((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909
## 14 V4:0.0552927
## 15 (V1:0.0440156,V4:0.0552927):0.0235787
## 16 V11:0.0239382
## 17 (V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456
## 18 V2:0
## 19 ((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902
## 20 V10:0.0117506
## 21 ((V6:0.00532132,(V11:0.0239382,(((V10:0.0117506,V8:0.191344):0.0245533,((V1:0.0440156,V4:0.0552927):0.0235787,(V3:0.0684707,V2:0):0):0.000833902):0.0094842,(V5:0.0070738,V9:0.0489904):0.00987258):0.0171909):0.0118456):0.0137485,V7:0.0137485);
## 22 <NA>
## 23 <NA>
## 24 <NA>
## x y ymin ymax
## 1 0.07682254 0.72727273 0.63636364 0.81818182
## 2 0.05973142 0.22727273 0.18181818 0.27272727
## 3 0.05265762 0.18181818 0.09090909 0.27272727
## 4 0.08677643 0.13636364 0.09090909 0.18181818
## 5 0.01906982 0.95454545 0.90909091 1.00000000
## 6 0.09041198 0.40909091 0.36363636 0.45454545
## 7 0.05226924 0.63636364 0.27272727 0.81818182
## 8 0.12069744 0.59090909 0.54545455 0.63636364
## 9 0.05310314 0.36363636 0.27272727 0.45454545
## 10 0.01374850 0.04545455 0.00000000 0.09090909
## 11 0.01374850 0.90909091 0.09090909 1.00000000
## 12 0.12862167 0.68181818 0.63636364 0.72727273
## 13 0.04278504 0.27272727 0.09090909 0.81818182
## 14 0.13197446 0.50000000 0.45454545 0.54545455
## 15 0.07668184 0.54545455 0.45454545 0.63636364
## 16 0.04953230 0.86363636 0.81818182 0.90909091
## 17 0.02559410 0.81818182 0.09090909 0.90909091
## 18 0.05310314 0.31818182 0.27272727 0.36363636
## 19 0.05310314 0.45454545 0.27272727 0.63636364
## 20 0.08857314 0.77272727 0.72727273 0.81818182
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.10884164 NA 0.45454545 0.54545455
## 23 0.03012114 NA 0.09090909 0.81818182
## 24 0.13197446 NA 0.45454545 0.54545455
## [1] "0.581629 0.076681842 0.131974462"
## [1] "0.263339 0.0255941 0.04278504"
## [1] "0.614093 0.076681842 0.131974462"
## [1] 0.05973142 0.08677643 0.01906982 0.09041198 0.12069744 0.01374850
## [7] 0.12862167 0.13197446 0.04953230 0.05310314 0.08857314
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m4.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 1 1 3 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 172 4 0 2
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171 1 15 1
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 136 9 4 1
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 2 2 16 6
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 104 2 32 2
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 51 1 52 10
## 22 263 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 23 216 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## 24 289 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 25 336 <NA> NOT_ROOT MIG NOT_TIP 2 3 NA NA NA
## V11
## 1 (V10:0.0111877,V8:0.201105):0.0250823
## 2 V5:0.00656589
## 3 (V5:0.00656589,V9:0.0510023):0.0105293
## 4 V9:0.0510023
## 5 V6:0.00532145
## 6 V3:0.0688054
## 7 (((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669
## 8 V1:0.0440791
## 9 (V2:0,V3:0.0688054):0
## 10 V7:0.0137486
## 11 ((V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565,V6:0.00532145):0.0137486
## 12 V8:0.201105
## 13 ((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456
## 14 V4:0.0553094
## 15 (V1:0.0440791,V4:0.0553094):0.0235664
## 16 V11:0.0262863
## 17 (V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565
## 18 V2:0
## 19 ((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883
## 20 V10:0.0111877
## 21 (V7:0.0137486,((V11:0.0262863,((V5:0.00656589,V9:0.0510023):0.0105293,(((V1:0.0440791,V4:0.0553094):0.0235664,(V2:0,V3:0.0688054):0):0.000855883,(V10:0.0111877,V8:0.201105):0.0250823):0.0092669):0.0180456):0.0110565,V6:0.00532145):0.0137486);
## 22 <NA>
## 23 <NA>
## 24 <NA>
## 25 <NA>
## x y ymin ymax
## 1 0.07719982 0.18181818 0.09090909 0.27272727
## 2 0.05994581 0.77272727 0.72727273 0.81818182
## 3 0.05337992 0.72727273 0.63636364 0.81818182
## 4 0.08775739 0.68181818 0.63636364 0.72727273
## 5 0.01907005 0.04545455 0.00000000 0.09090909
## 6 0.09024684 0.31818182 0.27272727 0.36363636
## 7 0.05211752 0.27272727 0.09090909 0.63636364
## 8 0.12061890 0.59090909 0.54545455 0.63636364
## 9 0.05297340 0.36363636 0.27272727 0.45454545
## 10 0.01374860 0.95454545 0.90909091 1.00000000
## 11 0.01374860 0.09090909 0.00000000 0.90909091
## 12 0.12897240 0.13636364 0.09090909 0.18181818
## 13 0.04285062 0.63636364 0.09090909 0.81818182
## 14 0.13184928 0.50000000 0.45454545 0.54545455
## 15 0.07653980 0.54545455 0.45454545 0.63636364
## 16 0.04945274 0.86363636 0.81818182 0.90909091
## 17 0.02480510 0.81818182 0.09090909 0.90909091
## 18 0.05297340 0.40909091 0.36363636 0.45454545
## 19 0.05297340 0.45454545 0.27272727 0.63636364
## 20 0.08838752 0.22727273 0.18181818 0.27272727
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.10535600 NA 0.45454545 0.54545455
## 23 0.03048592 NA 0.09090909 0.81818182
## 24 0.13184928 NA 0.45454545 0.54545455
## 25 0.08775739 NA 0.63636364 0.72727273
## [1] "0.520999 0.076539803 0.131849283"
## [1] "0.314804 0.0248051 0.04285062"
## [1] "0.605709 0.076539803 0.131849283"
## [1] "1 0.05337992 0.0877573938029508"
## [1] 0.05994581 0.08775739 0.01907005 0.09024684 0.12061890 0.01374860
## [7] 0.12897240 0.13184928 0.04945274 0.05297340 0.08838752
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m5.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 3 1 1 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 172 4 0 2
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171 1 104 2
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 4 1 136 9
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 16 6 2 2
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 31 1 103 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 15 1 32 3
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 52 10 51 1
## 22 263 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 23 216 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## 24 289 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 25 336 <NA> NOT_ROOT MIG NOT_TIP 2 3 NA NA NA
## 26 370 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## V11
## 1 (V10:0.0111015,V8:0.201352):0.0248959
## 2 V5:0.00645114
## 3 (V9:0.0516077,V5:0.00645114):0.0105381
## 4 V9:0.0516077
## 5 V6:0.00531978
## 6 V3:0.0634468
## 7 ((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277
## 8 V1:0.0441021
## 9 (V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551
## 10 V7:0.0137477
## 11 (V6:0.00531978,(V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111):0.0137477
## 12 V8:0.201352
## 13 (((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715
## 14 V4:0.0553085
## 15 (V1:0.0441021,V4:0.0553085):0.022421
## 16 V11:0.0264164
## 17 (V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111
## 18 V2:0.000403751
## 19 (V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311
## 20 V10:0.0111015
## 21 ((V6:0.00531978,(V11:0.0264164,(((V3:0.0634468,(V2:0.000403751,(V1:0.0441021,V4:0.0553085):0.022421):0.000340551):0.00154311,(V10:0.0111015,V8:0.201352):0.0248959):0.00954277,(V9:0.0516077,V5:0.00645114):0.0105381):0.0181715):0.0110111):0.0137477,V7:0.0137477);
## 22 <NA>
## 23 <NA>
## 24 <NA>
## 25 <NA>
## 26 <NA>
## x y ymin ymax
## 1 0.07736893 0.36363636 0.27272727 0.45454545
## 2 0.05991950 0.13636364 0.09090909 0.18181818
## 3 0.05346836 0.18181818 0.09090909 0.27272727
## 4 0.08785954 0.22727273 0.18181818 0.27272727
## 5 0.01906748 0.95454545 0.90909091 1.00000000
## 6 0.09100414 0.77272727 0.72727273 0.81818182
## 7 0.05247303 0.45454545 0.27272727 0.81818182
## 8 0.12087979 0.59090909 0.54545455 0.63636364
## 9 0.05435669 0.63636364 0.45454545 0.72727273
## 10 0.01374770 0.04545455 0.00000000 0.09090909
## 11 0.01374770 0.90909091 0.09090909 1.00000000
## 12 0.12915199 0.31818182 0.27272727 0.36363636
## 13 0.04293026 0.27272727 0.09090909 0.81818182
## 14 0.13208616 0.50000000 0.45454545 0.54545455
## 15 0.07677769 0.54545455 0.45454545 0.63636364
## 16 0.04944041 0.86363636 0.81818182 0.90909091
## 17 0.02475880 0.81818182 0.09090909 0.90909091
## 18 0.05469824 0.68181818 0.63636364 0.72727273
## 19 0.05401614 0.72727273 0.45454545 0.81818182
## 20 0.08847043 0.40909091 0.36363636 0.45454545
## 21 0.00000000 0.09090909 0.00000000 1.00000000
## 22 0.10434659 NA 0.45454545 0.54545455
## 23 0.03040578 NA 0.09090909 0.81818182
## 24 0.13208616 NA 0.45454545 0.54545455
## 25 0.08785954 NA 0.09090909 0.18181818
## 26 0.04293026 NA 0.09090909 0.81818182
## [1] "0.498457 0.076777691 0.132086161"
## [1] "0.310761 0.0247588 0.04293026"
## [1] "0.67518 0.076777691 0.132086161"
## [1] "1 0.05346836 0.0878595404120696"
## [1] "0.619417 0.0247588 0.04293026"
## [1] 0.05991950 0.08785954 0.01906748 0.09100414 0.12087979 0.01374770
## [7] 0.12915199 0.13208616 0.04944041 0.05469824 0.08847043
## [1] 0.003
## [1] "mse 0.000426880520661157"
t = plot_tree(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.m6.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 211 1 75 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 3 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 3 1 1 1
## 4 3 V9 NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 5 4 V6 NOT_ROOT NOT_MIG TIP 52 NA NA NA NA
## 6 15 V3 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 172 4 0 2
## 8 31 V1 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 171 1 15 1
## 10 51 V7 NOT_ROOT NOT_MIG TIP 213 NA NA NA NA
## 11 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 213 136 9 4 1
## 12 75 V8 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 13 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 136 16 6 2 2
## 14 103 V4 NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 15 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 172 103 1 31 1
## 16 135 V11 NOT_ROOT NOT_MIG TIP 136 NA NA NA NA
## 17 136 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 135 1 76 8
## 18 171 V2 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 19 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 104 2 32 2
## 20 211 V10 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 21 213 <NA> ROOT NOT_MIG NOT_TIP 213 51 1 52 10
## 22 263 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 23 216 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## 24 291 <NA> NOT_ROOT MIG NOT_TIP 104 103 NA NA NA
## 25 336 <NA> NOT_ROOT MIG NOT_TIP 2 3 NA NA NA
## 26 370 <NA> NOT_ROOT MIG NOT_TIP 136 76 NA NA NA
## 27 501 <NA> NOT_ROOT MIG NOT_TIP 32 15 NA NA NA
## V11
## 1 (V10:0.0108221,V8:0.205125):0.025204
## 2 V5:0.0049158
## 3 (V9:0.0641069,V5:0.0049158):0.0120451
## 4 V9:0.0641069
## 5 V6:0.00531979
## 6 V3:0.0604399
## 7 (((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326
## 8 V1:0.0438529
## 9 (V2:0.000582587,V3:0.0604399):2.82921e-05
## 10 V7:0.0137477
## 11 ((V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971,V6:0.00531979):0.0137477
## 12 V8:0.205125
## 13 ((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575
## 14 V4:0.0552339
## 15 (V4:0.0552339,V1:0.0438529):0.0224576
## 16 V11:0.0266826
## 17 (V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971
## 18 V2:0.000582587
## 19 ((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254
## 20 V10:0.0108221
## 21 (V7:0.0137477,((V11:0.0266826,((((V4:0.0552339,V1:0.0438529):0.0224576,(V2:0.000582587,V3:0.0604399):2.82921e-05):0.00182254,(V10:0.0108221,V8:0.205125):0.025204):0.00976326,(V9:0.0641069,V5:0.0049158):0.0120451):0.0180575):0.0108971,V6:0.00531979):0.0137477);
## 22 <NA>
## 23 <NA>
## 24 <NA>
## 25 <NA>
## 26 <NA>
## 27 <NA>
## x y ymin ymax
## 1 0.07766955 0.36363636 0.27272727 0.45454545
## 2 0.05966319 0.13636364 0.09090909 0.18181818
## 3 0.05474739 0.18181818 0.09090909 0.27272727
## 4 0.09029900 0.22727273 0.18181818 0.27272727
## 5 0.01906749 0.04545455 0.00000000 0.09090909
## 6 0.09153321 0.50000000 0.45454545 0.54545455
## 7 0.05246555 0.45454545 0.27272727 0.81818182
## 8 0.12059859 0.68181818 0.63636364 0.72727273
## 9 0.05431638 0.54545455 0.45454545 0.63636364
## 10 0.01374770 0.95454545 0.90909091 1.00000000
## 11 0.01374770 0.09090909 0.00000000 0.90909091
## 12 0.12946881 0.31818182 0.27272727 0.36363636
## 13 0.04270229 0.27272727 0.09090909 0.81818182
## 14 0.13197962 0.77272727 0.72727273 0.81818182
## 15 0.07674569 0.72727273 0.63636364 0.81818182
## 16 0.04940283 0.86363636 0.81818182 0.90909091
## 17 0.02464480 0.81818182 0.09090909 0.90909091
## 18 0.05480875 0.59090909 0.54545455 0.63636364
## 19 0.05428809 0.63636364 0.45454545 0.81818182
## 20 0.08849165 0.40909091 0.36363636 0.45454545
## 21 0.00000000 0.90909091 0.00000000 1.00000000
## 22 0.11787899 NA 0.63636364 0.72727273
## 23 0.03050944 NA 0.09090909 0.81818182
## 24 0.13197962 NA 0.63636364 0.72727273
## 25 0.09029900 NA 0.09090909 0.18181818
## 26 0.04270229 NA 0.09090909 0.81818182
## 27 0.05999265 NA 0.45454545 0.54545455
## [1] "0.744711 0.07674569 0.13197962"
## [1] "0.324776 0.0246448 0.04270229"
## [1] "0.768226 0.07674569 0.13197962"
## [1] "1 0.05474739 0.0902990037952442"
## [1] "0.574053 0.0246448 0.04270229"
## [1] "0.152519 0.0543163821 0.09153320730176"
## [1] 0.05966319 0.09029900 0.01906749 0.09153321 0.12059859 0.01374770
## [7] 0.12946881 0.13197962 0.04940283 0.05480875 0.08849165
## [1] 0.003
## [1] "mse 0.000426880520661157"
Along with the treemix software, are distribution the software threepop and fourpop. These measure f3 and f4 statistics which test for treeness in population trees.
The three-population test is of the form f3(A;B;C), where a significantly negative value of the f3 statistic implies that population A is admixed. The output is four columns: populations | f3 statistic | standard error | Z-score
#threepop -i ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.gz -k 500 | grep "^V" > ${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.3pop.tab
threepop \
-i /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix.gz \
-k 500 | \
grep "^V" > \
/Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.high_anc_coef_snmf.treemix.3pop.tab
#${POPSTR}$VCFNAME.high_anc_coef_snmf.treemix.3pop.tab
temp = read_delim(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.treemix.3pop.tab"), delim = " ",
col_names = c("populations", "f3 statistic", "standard error", "Z-score")) %>%
filter(`f3 statistic` < 0)
kable(temp)
| populations | f3 statistic | standard error | Z-score |
|---|---|---|---|
| V2;V1,V11 | -0.0015012 | 0.0008542 | -1.757460 |
| V2;V1,V5 | -0.0003266 | 0.0006020 | -0.542508 |
| V2;V1,V6 | -0.0007156 | 0.0007107 | -1.006930 |
| V2;V1,V7 | -0.0012880 | 0.0005334 | -2.414910 |
| V2;V1,V8 | -0.0010022 | 0.0007398 | -1.354720 |
| V2;V11,V3 | -0.0008395 | 0.0007929 | -1.058810 |
| V2;V11,V4 | -0.0007121 | 0.0009595 | -0.742185 |
| V2;V3,V5 | -0.0006051 | 0.0005058 | -1.196310 |
| V2;V3,V6 | -0.0010501 | 0.0008162 | -1.286560 |
| V2;V3,V7 | -0.0017748 | 0.0006643 | -2.671630 |
| V2;V4,V7 | -0.0005809 | 0.0006630 | -0.876153 |
| V2;V4,V8 | -0.0004811 | 0.0008289 | -0.580392 |
separate(temp, col = populations, into = c("Introgressed_pop", "Pop1", "Pop2"), remove = F) %>%
ggplot(aes(x = Pop1, y = Pop2, fill = `Z-score`)) +
geom_tile() +
theme_cowplot()
threshold_per_pop = 6
threshold_per_country = 20
#Subsample
temp = Zt_meta %>%
#filter(!(ID_file %in% filtered_samples$ID_file)) %>%
#unite(Country, Region, col = "Country2", remove = F) %>%
#mutate(Country_for_filter = ifelse(Country == "USA", Country2, Country)) %>%
mutate(x = round(Latitude, 1), y = round(Longitude, 1)) %>%
unite(Coord_for_filter, x, y, sep = ";", remove = F) %>%
group_by(Coord_for_filter) %>%
dplyr::mutate(Nb_per_coord = n()) %>%
group_by(Country) %>%
dplyr::mutate(Nb_per_country= n())
small_pop = temp %>%
filter(Nb_per_coord <= threshold_per_pop) %>%
filter(Nb_per_country <= threshold_per_country)
temp2 = bind_rows(temp %>%
filter(!(ID_file %in% small_pop$ID_file))%>%
filter(Nb_per_coord > threshold_per_pop) %>%
group_by(Coord_for_filter) %>%
dplyr::sample_n(threshold_per_pop),
temp %>%
filter(!(ID_file %in% small_pop$ID_file))%>%
filter(Nb_per_coord <= threshold_per_pop)) %>%
ungroup() %>%
group_by(Country) %>%
dplyr::mutate(Nb_per_country= n())
Zt_meta_even_sampling = bind_rows(temp2 %>%
filter(Nb_per_country > threshold_per_country) %>%
dplyr::sample_n(threshold_per_country),
temp2 %>%
filter(Nb_per_country <= threshold_per_country)) %>%
bind_rows(., small_pop) %>%
ungroup()
write_tsv(Zt_meta_even_sampling %>% dplyr::select(ID_file),
paste0(PopStr_dir, "List_samples_even_sampling.txt"),
col_names = F)
ggplot() +
geom_bar(data = temp %>% group_by(Continent, Country) %>% dplyr::count(),
aes(x= Country, y = n, fill = Continent), alpha = 0.4, stat = "identity") +
geom_bar(data = Zt_meta_even_sampling %>% group_by(Continent, Country) %>% dplyr::count(),
aes(x= Country, y = n, fill = Continent), alpha = 0.4, stat = "identity") +
Fill_Continent + theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
head ${POPSTR}List_samples_even_sampling.txt
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}List_samples_even_sampling.txt \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--out ${POPSTR}$VCFNAME.even_sampling
cat ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2
cat ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT | cut -f 1,2 \
> ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2 | gsed "s/\t/\n/g" \
> ${POPSTR}$VCFNAME.even_sampling.ind
gsed "s/\t//g" ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2 | tail -n +2 \
> ${POPSTR}$VCFNAME.even_sampling.geno
datamash transpose < ${POPSTR}$VCFNAME.even_sampling.GT.FORMAT2 | sed 's/^/>/' | sed 's/\t/\n/' | sed 's/\t//g' | cut -c -150 > ${POPSTR}$VCFNAME.even_sampling.fasta
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv \
# ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/
echo -e "Subset_samples\tChromosome\tStart\tStop\tPi\tTajimaD\tTheta" > \
../3_Sumstats_demography/Diversity_per_cluster.tsv
cat ../3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv >> \
../3_Sumstats_demography/Diversity_per_cluster.tsv
ordering_table = tibble(Cluster = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
Order_tree = c(4, 10, 9, 6, 7, 5, 1, 3, 2, 11, 8))
diversity_values = read_tsv(paste0(Sumstats_dir, "Diversity_per_cluster.tsv"), na = "nan")
temp = diversity_values %>%
dplyr::select(-Theta) %>%
mutate(Subset_samples = str_remove(Subset_samples, "Sample_list_")) %>%
separate(Subset_samples, into = c("Cluster", "Rep")) %>%
group_by(Cluster, Chromosome, Start, Stop) %>%
dplyr::summarize(Pi = mean(Pi, na.rm = T), TajimaD = mean(TajimaD, na.rm = T)) %>%
pivot_longer(cols = c(Pi, TajimaD), names_to = "Estimate", values_to = "Value") %>%
left_join(., ordering_table) %>%
arrange(Order_tree)
median_values = ungroup(temp) %>%
group_by(Estimate) %>%
dplyr::summarize(average = median(Value, na.rm = T))
pi_data = temp %>%
filter(Estimate == "Pi") %>%
filter(!is.na(Cluster)) %>%
group_by(Cluster) %>%
dplyr::mutate(cluster_avg = median(Value, na.rm = T))
p1 = ggplot(pi_data, aes(x = reorder(Cluster, -Order_tree), y = Value, fill = Cluster)) +
geom_boxplot(outlier.shape = NA) +
#facet_wrap(vars(Estimate), scales = "free") +
coord_flip() + ylim(c(-0.001, 0.065)) +
labs(y = "Diversity (pi)") +
theme(legend.position = "None") +
geom_hline(yintercept = pull(median_values %>% filter(Estimate == "Pi") %>% dplyr::select(average)),
linetype = "dashed")
p2 = temp %>%
filter(Estimate == "TajimaD") %>%
ggplot(aes(x = reorder(Cluster, -Order_tree), y = Value, fill = Cluster)) +
geom_violin() +
geom_boxplot(width=.2, fill = "white", outlier.shape = NA) +
#facet_wrap(vars(Estimate), scales = "free") +
coord_flip() +
labs(y = "Tajima's D", x = "") +
theme_cowplot() +
theme(legend.position = "None") +
geom_hline(yintercept = pull(median_values %>% filter(Estimate == "TajimaD") %>% dplyr::select(average)),
linetype = "dashed")
cowplot::plot_grid(p1, p2, ncol = 2)
#One-way ANOVA with blocks
##Define linear model
model = lm(Value ~ Cluster,
data=pi_data)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = Value ~ Cluster, data = pi_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.016725 -0.008952 -0.004141 0.005035 0.114297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.027e-02 7.997e-05 128.458 < 2e-16 ***
## ClusterV10 5.996e-03 1.131e-04 53.023 < 2e-16 ***
## ClusterV11 -5.197e-03 1.131e-04 -45.952 < 2e-16 ***
## ClusterV2 -8.199e-04 1.131e-04 -7.250 4.17e-13 ***
## ClusterV3 3.654e-03 1.131e-04 32.312 < 2e-16 ***
## ClusterV4 -1.250e-04 1.131e-04 -1.105 0.269
## ClusterV5 -3.679e-03 1.131e-04 -32.533 < 2e-16 ***
## ClusterV6 5.228e-03 1.131e-04 46.231 < 2e-16 ***
## ClusterV7 3.635e-03 1.131e-04 32.141 < 2e-16 ***
## ClusterV8 2.284e-03 1.131e-04 20.200 < 2e-16 ***
## ClusterV9 6.452e-03 1.131e-04 57.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01314 on 297099 degrees of freedom
## Multiple R-squared: 0.07332, Adjusted R-squared: 0.07329
## F-statistic: 2351 on 10 and 297099 DF, p-value: < 2.2e-16
##Conduct analysis of variance
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: Value
## Sum Sq Df F value Pr(>F)
## Cluster 4.060 10 2350.7 < 2.2e-16 ***
## Residuals 51.315 297099
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
##
## Call:
## lm(formula = Value ~ Cluster, data = pi_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.016725 -0.008952 -0.004141 0.005035 0.114297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.027e-02 7.997e-05 128.458 < 2e-16 ***
## ClusterV10 5.996e-03 1.131e-04 53.023 < 2e-16 ***
## ClusterV11 -5.197e-03 1.131e-04 -45.952 < 2e-16 ***
## ClusterV2 -8.199e-04 1.131e-04 -7.250 4.17e-13 ***
## ClusterV3 3.654e-03 1.131e-04 32.312 < 2e-16 ***
## ClusterV4 -1.250e-04 1.131e-04 -1.105 0.269
## ClusterV5 -3.679e-03 1.131e-04 -32.533 < 2e-16 ***
## ClusterV6 5.228e-03 1.131e-04 46.231 < 2e-16 ***
## ClusterV7 3.635e-03 1.131e-04 32.141 < 2e-16 ***
## ClusterV8 2.284e-03 1.131e-04 20.200 < 2e-16 ***
## ClusterV9 6.452e-03 1.131e-04 57.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01314 on 297099 degrees of freedom
## Multiple R-squared: 0.07332, Adjusted R-squared: 0.07329
## F-statistic: 2351 on 10 and 297099 DF, p-value: < 2.2e-16
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
marginal = lsmeans(model, ~ Cluster)
pairs(marginal, adjust="tukey")
## contrast estimate SE df t.ratio p.value
## V1 - V10 -6.00e-03 0.000113 297099 -53.023 <.0001
## V1 - V11 5.20e-03 0.000113 297099 45.952 <.0001
## V1 - V2 8.20e-04 0.000113 297099 7.250 <.0001
## V1 - V3 -3.65e-03 0.000113 297099 -32.312 <.0001
## V1 - V4 1.25e-04 0.000113 297099 1.105 0.9907
## V1 - V5 3.68e-03 0.000113 297099 32.533 <.0001
## V1 - V6 -5.23e-03 0.000113 297099 -46.231 <.0001
## V1 - V7 -3.63e-03 0.000113 297099 -32.141 <.0001
## V1 - V8 -2.28e-03 0.000113 297099 -20.200 <.0001
## V1 - V9 -6.45e-03 0.000113 297099 -57.055 <.0001
## V10 - V11 1.12e-02 0.000113 297099 98.975 <.0001
## V10 - V2 6.82e-03 0.000113 297099 60.273 <.0001
## V10 - V3 2.34e-03 0.000113 297099 20.711 <.0001
## V10 - V4 6.12e-03 0.000113 297099 54.128 <.0001
## V10 - V5 9.68e-03 0.000113 297099 85.556 <.0001
## V10 - V6 7.68e-04 0.000113 297099 6.793 <.0001
## V10 - V7 2.36e-03 0.000113 297099 20.882 <.0001
## V10 - V8 3.71e-03 0.000113 297099 32.824 <.0001
## V10 - V9 -4.56e-04 0.000113 297099 -4.032 0.0027
## V11 - V2 -4.38e-03 0.000113 297099 -38.702 <.0001
## V11 - V3 -8.85e-03 0.000113 297099 -78.264 <.0001
## V11 - V4 -5.07e-03 0.000113 297099 -44.847 <.0001
## V11 - V5 -1.52e-03 0.000113 297099 -13.419 <.0001
## V11 - V6 -1.04e-02 0.000113 297099 -92.183 <.0001
## V11 - V7 -8.83e-03 0.000113 297099 -78.093 <.0001
## V11 - V8 -7.48e-03 0.000113 297099 -66.152 <.0001
## V11 - V9 -1.16e-02 0.000113 297099 -103.007 <.0001
## V2 - V3 -4.47e-03 0.000113 297099 -39.562 <.0001
## V2 - V4 -6.95e-04 0.000113 297099 -6.145 <.0001
## V2 - V5 2.86e-03 0.000113 297099 25.282 <.0001
## V2 - V6 -6.05e-03 0.000113 297099 -53.481 <.0001
## V2 - V7 -4.45e-03 0.000113 297099 -39.391 <.0001
## V2 - V8 -3.10e-03 0.000113 297099 -27.450 <.0001
## V2 - V9 -7.27e-03 0.000113 297099 -64.305 <.0001
## V3 - V4 3.78e-03 0.000113 297099 33.417 <.0001
## V3 - V5 7.33e-03 0.000113 297099 64.844 <.0001
## V3 - V6 -1.57e-03 0.000113 297099 -13.919 <.0001
## V3 - V7 1.93e-05 0.000113 297099 0.171 1.0000
## V3 - V8 1.37e-03 0.000113 297099 12.112 <.0001
## V3 - V9 -2.80e-03 0.000113 297099 -24.743 <.0001
## V4 - V5 3.55e-03 0.000113 297099 31.428 <.0001
## V4 - V6 -5.35e-03 0.000113 297099 -47.335 <.0001
## V4 - V7 -3.76e-03 0.000113 297099 -33.246 <.0001
## V4 - V8 -2.41e-03 0.000113 297099 -21.305 <.0001
## V4 - V9 -6.58e-03 0.000113 297099 -58.160 <.0001
## V5 - V6 -8.91e-03 0.000113 297099 -78.763 <.0001
## V5 - V7 -7.31e-03 0.000113 297099 -64.674 <.0001
## V5 - V8 -5.96e-03 0.000113 297099 -52.732 <.0001
## V5 - V9 -1.01e-02 0.000113 297099 -89.587 <.0001
## V6 - V7 1.59e-03 0.000113 297099 14.090 <.0001
## V6 - V8 2.94e-03 0.000113 297099 26.031 <.0001
## V6 - V9 -1.22e-03 0.000113 297099 -10.824 <.0001
## V7 - V8 1.35e-03 0.000113 297099 11.941 <.0001
## V7 - V9 -2.82e-03 0.000113 297099 -24.914 <.0001
## V8 - V9 -4.17e-03 0.000113 297099 -36.855 <.0001
##
## P value adjustment: tukey method for comparing a family of 11 estimates
CLD = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "tukey") ### Tukey-adjusted p-values
CLD
## Cluster lsmean SE df lower.CL upper.CL .group
## V11 0.00508 8e-05 297099 0.00485 0.00530 a
## V5 0.00659 8e-05 297099 0.00637 0.00682 b
## V2 0.00945 8e-05 297099 0.00923 0.00968 c
## V4 0.01015 8e-05 297099 0.00992 0.01037 d
## V1 0.01027 8e-05 297099 0.01005 0.01050 d
## V8 0.01256 8e-05 297099 0.01233 0.01278 e
## V7 0.01391 8e-05 297099 0.01368 0.01413 f
## V3 0.01393 8e-05 297099 0.01370 0.01415 f
## V6 0.01550 8e-05 297099 0.01527 0.01573 g
## V10 0.01627 8e-05 297099 0.01604 0.01650 h
## V9 0.01672 8e-05 297099 0.01650 0.01695 i
##
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 11 estimates
## P value adjustment: tukey method for comparing a family of 11 estimates
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
CLD$.group=gsub(" ", "", CLD$.group)
### Plot
pi_ave = pull(median_values %>% filter(Estimate == "Pi") %>% dplyr::select(average))
p1 = ggplot(pi_data, aes(x = reorder(Cluster, -Order_tree), y = Value)) +
coord_flip() +
geom_boxplot(outlier.shape = NA, alpha = .4, color = "grey") +
geom_segment(aes(x = Cluster, xend = Cluster,
y = pi_ave, yend = cluster_avg, color = Cluster), size = 0.8) +
#geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
geom_hline(aes(yintercept = pi_ave), color = "gray20", size = 0.6) +
geom_point(aes(color = Cluster, x = Cluster, y = cluster_avg), size = 3) +
#stat_summary(aes(color = Cluster), fun = mean, geom = "point", size = 2) +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = "Diversity (pi)") +
ylim(c(-0.001, 0.055)) +
geom_text(data = CLD, aes(x = Cluster, label = .group, y = 0.054), color = "black")
cowplot::plot_grid(p1, p2, ncol = 2)
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/2_Population_structure/Sample_list_V*.ann.tsv ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/
list_snpEff = list()
for (K in c(1:chosen_K)){
#for (K in c(1:3)){
print(K)
snpEff_per_cluster = read_tsv(paste0(Sumstats_dir, "Sample_list_V", K, ".ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN"))%>%
separate(ANN, sep = ",",
into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
"ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"),
extra = "warn") %>%
pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
filter(!is.na(Annotation)) %>%
separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge")%>%
dplyr::mutate(Ranked_effect_strength = Effect_strength)
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "HIGH"] = 1
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "MODERATE"] = 2
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "LOW"] = 3
snpEff_per_cluster$Ranked_effect_strength[snpEff_per_cluster$Ranked_effect_strength == "MODIFIER"] = 4
snpEff_per_cluster$Ranked_effect_strength[is.na(snpEff_per_cluster$Effect_strength)] = 5
snpEff_per_cluster$Effect_strength[is.na(snpEff_per_cluster$Effect_strength)] = "NO_EFFECT"
snpEff_per_cluster = snpEff_per_cluster %>%
group_by(CHR, BP) %>%
dplyr::mutate(ranktemp = rank(Ranked_effect_strength, ties.method = "random")) %>%
filter(ranktemp == 1)
list_snpEff[[K]] = ungroup(snpEff_per_cluster) %>%
dplyr::count(Effect_strength) %>%
dplyr::mutate(Cluster = K)
}
summary_snpEff_per_cluster = bind_rows(list_snpEff)
write_tsv(summary_snpEff_per_cluster, paste0(Sumstats_dir, "Summary_snpEff_per_cluster.tsv"))
summary_snpEff_per_cluster = read_tsv(paste0(Sumstats_dir, "Summary_snpEff_per_cluster.tsv"))
summary_snpEff_per_cluster %>%
ggplot(aes(x = as.factor(Cluster), y = n, fill =Effect_strength)) +
geom_bar(stat = "identity") +
theme_bw()
summary_snpEff_per_cluster %>%
group_by(Cluster) %>%
dplyr::mutate(total = sum(n),
prop = n/total) %>%
ggplot(aes(x = Effect_strength, y = prop, fill = as.factor(Cluster))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw()
summary_snpEff_per_cluster %>%
group_by(Cluster) %>%
dplyr::mutate(total = sum(n),
prop = n/total) %>%
filter(Effect_strength != "MODIFIER") %>%
ggplot(aes(x = as.factor(Cluster), y = prop, fill = as.factor(Effect_strength))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw()
summary_snpEff_per_cluster %>%
group_by(Cluster) %>%
dplyr::mutate(total = sum(n)) %>%
filter(Effect_strength != "MODIFIER") %>%
filter(Effect_strength != "LOW") %>%
mutate(prop = n/total) %>%
ggplot(aes(x = as.factor(Cluster), y = prop, fill = as.factor(Effect_strength))) +
geom_bar(stat = "identity") +
theme_bw()
#for i in {1..11} ; do for j in {1..11} ; do python Divergence_with_scikit-allel_Fst.py --vcf_file ../1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp.recode.vcf --sample_list1 ../2_Population_structure/Sample_list_V${i}.args --sample_list2 ../2_Population_structure/Sample_list_V${j}.args --out_dir ../2_Population_structure/Divergence/ --no_header ; done ; done
# echo -e "Subset1\tSubset2\tHudsons_Fst\tWeir_Cockerham_Fst" > \
# ../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
# cat ../2_Population_structure/Divergence/Divergence.Fst.Sample_list_V*_vs_Sample_list_V*.tsv >> \
# ../2_Population_structure/Divergence/Divergence.Fst_between_clusters.tsv
ordering_table = tibble(Clusters = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"),
Order_tree = c(4, 10, 9, 6, 7, 5, 1, 3, 2, 11, 8))
Fst_values = read_tsv(paste0(PopStr_dir, "Divergence.Fst_between_clusters.tsv"))
temp = Fst_values %>% dplyr::select(-Hudsons_Fst) %>%
mutate(Weir_Cockerham_Fst = ifelse(Weir_Cockerham_Fst < 0, 0, Weir_Cockerham_Fst),
Subset1 = str_remove(Subset1, "Sample_list_"),
Subset2 = str_remove(Subset2, "Sample_list_")) %>%
left_join(., ordering_table, by = c("Subset1" = "Clusters")) %>%
left_join(., ordering_table, by = c("Subset2" = "Clusters")) %>%
arrange(Order_tree.x, Order_tree.y) %>%
dplyr::select(-Order_tree.x, -Order_tree.y) %>%
pivot_wider(names_from = Subset2, values_from = Weir_Cockerham_Fst)
WC_Fst_values <- as.matrix(temp[,-1])
rownames(WC_Fst_values) <- temp[,1] %>% pull()
corrplot(WC_Fst_values, method="circle", is.corr=FALSE,
tl.col = "black", tl.srt=45, tl.cex = 1)
#type="upper")
library(geodist)
library(ggExtra)
#Estimate geographic distances between samples
temp = dplyr::select(Zt_meta, ID_file, Latitude, Longitude) %>%
filter(!is.na(Latitude))
geo_distances = as.data.frame(geodist(x = temp, sequential = FALSE, measure = "geodesic"))
colnames(geo_distances) <- temp$ID_file
geo_distances$ID_file1 = temp$ID_file
geo_distances = as.tibble(geo_distances) %>%
pivot_longer(-ID_file1, names_to = "ID_file2", values_to = "Geo_distance")
#Run IBS with plink on cluster and import
# ${SOFTPATH}plink --distance square ibs --vcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz --out ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80 --const-fid --not-chr 14-21 mt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs.id ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs.id
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.mibs ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.mibs
#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf ${vcf_dir}${VCFBasename}.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz --missing-indv --out temp
#rsync -avP alice@130.125.25.244:/home/alice/WW_PopGen/temp.imiss ~/Documents/Postdoc_Bruce/Projects/WW_project/2_Population_structure/0_Nuclear_genome/All_good_samples.miss
#Read genetic distances
ibs = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs.id"), col_names = c("Whatever", "ID_file1")) %>%
dplyr::select(ID_file1) %>% pull()
related = read_tsv(paste0(nuc_PS_dir, "All_good_samples.mibs"), col_names = ibs) %>%
mutate(ID_file1 = ibs) %>%
pivot_longer(names_to = "ID_file2", values_to = "Relatedness", cols = -ID_file1)
distances = inner_join(related, geo_distances)
#To do: compare intra-continent and inter-continent
#To do: compare intra-cluster and inter-cluster
#
distances = distances %>%
inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Continent1 = Continent)) %>%
inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Continent2 = Continent)) %>%
mutate(Continent_comparison = ifelse(Continent1 == Continent2, "Intra", "Inter")) %>%
filter(Continent1 != "Asia") %>%
filter(ID_file1 != ID_file2)
#Dot plots
distances %>%
filter(Continent_comparison == "Intra") %>%
ggplot(aes(x = Geo_distance, y = Relatedness, col = Continent1)) +
geom_point(shape = 1, alpha = 0.6) +
theme_bw() +
facet_wrap(vars(Continent1), scales = "free") +
Color_Continent +
stat_smooth(col = "grey20", method = "lm")#, formula = "y ~ log(x)")
# Discretized distances
temp = distances %>%
filter(Continent_comparison == "Intra") %>%
filter(Geo_distance < 4000000) %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000))) %>%
dplyr::count(Continent1, Geo_distance_disc) #%>%
#filter(n >= 100)
distances %>%
filter(Continent_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
# inner_join(temp) %>%
filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Continent1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Continent1), scales = "free_y") +
Color_Continent +
geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))
temp = list()
for (i in c(1:chosen_K)){
temp[[i]] = read_tsv(paste0(PopStr_dir, "Sample_list_V", i, ".args"), col_names = "ID_file") %>%
mutate( Cluster = paste0("V", i)) }
list_pure_cluster = bind_rows(temp)
distances = distances %>%
inner_join(list_pure_cluster %>% dplyr::select(ID_file1 = ID_file, Cluster1 = Cluster)) %>%
inner_join(list_pure_cluster %>% dplyr::select(ID_file2 = ID_file, Cluster2 = Cluster)) %>%
mutate(Cluster_comparison = ifelse(Cluster1 == Cluster2, "Intra", "Inter"))
distances %>%
filter(Cluster_comparison == "Intra") %>%
# inner_join(temp) %>%
# filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance, y = Relatedness, col = Cluster1)) +
geom_point() +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free") +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))+
stat_smooth(col = "grey20", method = "lm")
# Discretized distances
distances %>%
filter(Cluster_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(500*round(Geo_distance/500000)))%>%
# inner_join(temp) %>%
# filter(Geo_distance < 4000000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free_y") +
#geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
# col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))
distances %>%
filter(Cluster_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(100*round(Geo_distance/100000)))%>%
# inner_join(temp) %>%
filter(Geo_distance < 1000000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free_y") +
#geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
# col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))
distances %>%
filter(Cluster_comparison == "Intra") %>%
mutate(Geo_distance_disc = as.factor(50*round(Geo_distance/50000)))%>%
# inner_join(temp) %>%
filter(Geo_distance < 500000) %>%
ggplot(aes(x = Geo_distance_disc, y = Relatedness, col = Cluster1)) +
geom_boxplot(outlier.shape = 1) +
theme_bw() +
facet_wrap(vars(Cluster1), scales = "free_y") +
#geom_text(data = temp, aes(x = Geo_distance_disc, y = 1, label = n),
# col = "black", angle = 90, size = 4) +
theme(axis.text.x = element_text(angle = 45, hjust =1, vjust = 1))
# Comparing fit of models in Europe
set.seed(123)
subset_distances = distances %>% filter(Continent1 == "Europe") %>%
filter(Continent_comparison == "Intra")
training.samples <- distances$Relatedness %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- subset_distances[training.samples, ]
test.data <- subset_distances[-training.samples, ]
# Build the model for linear regression
model <- lm(Relatedness ~ Geo_distance, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Relatedness),
R2 = R2(predictions, test.data$Relatedness)
)
## RMSE R2
## 1 0.002795046 0.03554424
# Build the model for log regression
model <- lm(Relatedness ~ log(Geo_distance + 1), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Relatedness),
R2 = R2(predictions, test.data$Relatedness)
)
## RMSE R2
## 1 0.002819238 0.01878363
# Build the model
model <- gam(Relatedness ~ s(Geo_distance), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Relatedness),
R2 = R2(predictions, test.data$Relatedness)
)
## RMSE R2
## 1 0.002748006 0.06772863
ggplot(train.data, aes(Geo_distance, Relatedness) ) +
geom_point() +
stat_smooth(method = gam, formula = y ~ s(x))
ggplot(train.data, aes(Geo_distance, Relatedness) ) +
geom_hex()
#
temp = distances %>% filter(Continent1 == "Europe") %>%
filter(Continent2 == "Europe") %>%
mutate(Relat_group = ifelse(Relatedness > 0.935, "High", ifelse(Relatedness < 0.925, "Low", "Medium"))) %>%
inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Country1 = Country, Year1 = Sampling_Date)) %>%
inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Country2 = Country, Year2 = Sampling_Date)) %>%
dplyr::select(-c("Continent1", "Continent2", "Continent_comparison", "Cluster1", "Cluster2"))
ungroup(temp) %>% group_by(Relat_group, Year1, Year2) %>%
dplyr::count() %>%
ggplot(aes(x = Year1, y = Year2, size = n, col = Relat_group))+
geom_abline(slope = 1, intercept = 0, col = "grey30") +
geom_point(alpha = .4) +
theme_bw() + facet_wrap(vars(Relat_group))
ungroup(temp) %>% group_by(Relat_group, Country1, Country2) %>%
dplyr::count() %>%
ggplot(aes(x = Country1, y = Country2, size = n, col = Relat_group))+
geom_abline(slope = 1, intercept = 0, col = "grey30") +
geom_point(alpha = .4) +
theme_bw() + facet_wrap(vars(Relat_group))
p1 = bind_rows(ungroup(temp) %>% dplyr::select(Relatedness, ID_file = ID_file1),
ungroup(temp) %>% dplyr::select(Relatedness, ID_file = ID_file2))%>%
group_by(ID_file) %>%
dplyr::summarize(median_rel = median(Relatedness), ave_rel = mean(Relatedness)) %>%
dplyr::mutate(median_rel_cat = ifelse(median_rel > 0.935, "High", ifelse(median_rel < 0.925, "Low", "Medium"))) %>%
full_join(read_tsv(paste0(nuc_PS_dir, "All_good_samples.miss")), by = c("ID_file" = "INDV"))
ggplot(data = p1, aes(median_rel, F_MISS)) + #TODO: compare with missing data...
geom_point(alpha = .4) +
geom_text(data = p1 %>% filter(median_rel_cat != "Medium"),
aes (median_rel, F_MISS, label = ID_file))
inner_join(p1, dplyr::select(Zt_meta, Latitude, Longitude, ID_file)) %>%
dplyr::filter(median_rel_cat != "Medium") %>%
ggplot(aes(x = Latitude, y = Longitude, col = median_rel_cat)) +
geom_point() +
theme_bw()
distances = distances %>%
inner_join(Zt_meta %>% dplyr::select(ID_file1 = ID_file, Year1 = Sampling_Date)) %>%
inner_join(Zt_meta %>% dplyr::select(ID_file2 = ID_file, Year2 = Sampling_Date)) %>%
filter(!is.na(Year1) & !is.na(Year2)) %>%
mutate(Time_diff = abs(Year1 - Year2))
samples_to_exclude = p1 %>% filter(median_rel_cat != "Medium") %>% pull(ID_file)
p2 = distances %>% filter(Continent1 == "Europe") %>%
filter(!(ID_file1 %in% samples_to_exclude)) %>%
filter(!(ID_file2 %in% samples_to_exclude)) %>%
filter(Continent_comparison == "Intra") %>%
ggplot(aes(Geo_distance, Relatedness, col = Time_diff) ) +
geom_point(alpha = .4) +
coord_cartesian(ylim = c(0.91, 0.975))
p3 = distances %>% filter(Continent1 == "Europe") %>%
filter(Continent_comparison == "Intra") %>%
ggplot(aes(Geo_distance, Relatedness, col = Time_diff) ) +
geom_point(alpha = .4) +
coord_cartesian(ylim = c(0.91, 0.975))
cowplot::plot_grid(p2, p3, ncol = 2)
# Model
subset_distances = distances %>% filter(Continent1 == "Europe") %>%
filter(Continent_comparison == "Intra") %>%
filter(!(ID_file1 %in% samples_to_exclude)) %>%
filter(!(ID_file2 %in% samples_to_exclude))
training.samples <- subset_distances$Relatedness %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- subset_distances[training.samples, ]
test.data <- subset_distances[-training.samples, ]
# Build the model for linear regression
model <- lm(Relatedness ~ Geo_distance, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Relatedness),
R2 = R2(predictions, test.data$Relatedness)
)
## RMSE R2
## 1 0.001056302 0.01144103
library(mgcv)
# Build the model
model <- gam(Relatedness ~ s(Geo_distance), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$Relatedness),
R2 = R2(predictions, test.data$Relatedness)
)
## RMSE R2
## 1 0.001033025 0.054408
ggplot(train.data, aes(Geo_distance, Relatedness) ) +
geom_point(alpha = .2) +
stat_smooth(method = gam, formula = y ~ s(x)) +
coord_cartesian(ylim = c(0.92, 0.94)) +
theme_bw()
# Get area for country, wheat culture and percentage of land dedicated to wheat
temp = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-15-2021_land_stats.csv"),
col_types = "ccdcdcdcddcdc")
wheat_per_country = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-9-2021_crops_production.csv")) %>%
filter(Unit == "ha") %>%
dplyr::select(Area, `Year Code`, Value) %>%
group_by(Area) %>%
summarize(Wheat_ha = mean(Value)) %>%
inner_join(., filter(temp, Item == "Arable land") %>%
mutate(Arable_land_area_in_ha = Value * 1000) %>%
dplyr::select(Area, Arable_land_area_in_ha)) %>%
mutate(Arable_land_for_wheat = Wheat_ha * 100 / Arable_land_area_in_ha) %>%
inner_join(., filter(temp, Item == "Land area") %>%
mutate(Land_area_in_ha = Value * 1000) %>%
dplyr::select(Area, Land_area_in_ha)) %>%
mutate(Land_for_wheat = Wheat_ha * 100 / Land_area_in_ha)
wheat_per_state = read_tsv(paste0(metadata_dir, "USDA_crop_production_2017-2019.txt")) %>%
mutate(Land_area_in_ha = `Land area` * 1000) %>%
dplyr::select(Area, Year, Wheat_ha, Land_area_in_ha) %>%
group_by(Area) %>%
summarize(Wheat_ha = mean(Wheat_ha), Land_area_in_ha = mean(Land_area_in_ha)) %>%
mutate(Land_for_wheat = Wheat_ha * 100 / Land_area_in_ha)
wheat_per_country = bind_rows(wheat_per_country, wheat_per_state)
#Normalize country names
correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary")
for (i in c(1:nrow(correspondence_table))) {
print(pull(correspondence_table[i, 2]))
print(pull(correspondence_table[i, 1]))
wheat_per_country = wheat_per_country %>%
mutate_at(vars(contains('Area')),
funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Bolivia (Plurinational State of)"
## [1] "Bolivia"
## [1] "Brunei Darussalam"
## [1] "Brunei"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "French Guyana"
## [1] "French Guiana"
## [1] "Iran (Islamic Republic of)"
## [1] "Iran"
## [1] "Côte d'Ivoire"
## [1] "Ivory Coast"
## [1] "North Macedonia"
## [1] "Macedonia"
## [1] "Melanesia"
## [1] "Mayotte"
## [1] "Republic of Moldova"
## [1] "Moldova"
## [1] "Democratic People's Republic of Korea"
## [1] "North Korea"
## [1] "Congo"
## [1] "Republic of Congo"
## [1] "Réunion"
## [1] "Reunion"
## [1] "Russian Federation"
## [1] "Russia"
## [1] "Saint Kitts and Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent and the Grenadines"
## [1] "Saint Vincent"
## [1] "Republic of Korea"
## [1] "South Korea"
## [1] "Syrian Arab Republic"
## [1] "Syria"
## [1] "United Republic of Tanzania"
## [1] "Tanzania"
## [1] "Trinidad and Tobago"
## [1] "Trinidad"
## [1] "United Kingdom of Great Britain and Northern Ireland"
## [1] "UK"
## [1] "United States of America"
## [1] "USA"
## [1] "Venezuela (Bolivarian Republic of)"
## [1] "Venezuela"
#Maps
world2 = left_join(world, wheat_per_country, by = c("region" = "Area"))
temp = Zt_meta %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0)
ggplot() + theme_void() +
geom_polygon(data = world2, aes(x=long, y = lat, group = group, fill = Land_for_wheat), alpha=0.7) +
scale_size("Number of genomes", limits = c(1, max_circle)) +
scale_fill_gradient(low = "#FEEDD8", high = "goldenrod", na.value="#F7F4F3")
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/3_Sumstats_demography/Sample_list_V*_*_diversity_pi.tsv \
# ~/Documents/Postdoc_Bruce/Projects/WW_project/3_Sumstats_demography/
echo -e "Subset_samples\tChromosome\tStart\tStop\tPi\tTajimaD\tTheta" > \
../3_Sumstats_demography/Diversity_per_country.tsv
for x in Algeria Argentina Australia Australia_NSW Australia_Tasmania Belgium Canada Chile Czech_Republic Denmark France Germany Iran Ireland Israel Netherlands New_Zealand Switzerland Tunisia Turkey UK USA_Indiana USA_Missouri USA_Oregon USA_Texas Uruguay;
do
cat ../3_Sumstats_demography/Sample_list_${x}_diversity_pi.tsv >> \
../3_Sumstats_demography/Diversity_per_country.tsv
done
#Setting countries and windows
temp_countries = c("Algeria", "Argentina", "Australia", "Belgium", "Canada", "Chile", "Czech_Republic",
"Denmark", "France", "Germany", "Iran", "Ireland", "Israel", "Netherlands",
"New_Zealand", "Switzerland", "Tunisia", "Turkey", "UK",
"USA_Indiana", "USA_Missouri", "USA_Oregon", "USA_Texas", "Uruguay")
diversity_values = read_tsv(paste0(Sumstats_dir, "Diversity_per_country.tsv"), na = "nan")
temp = diversity_values %>% dplyr::select(-Theta, -TajimaD) %>%
mutate(Country = str_remove(Subset_samples, "Sample_list_")) %>%
group_by(Country) %>%
summarize(Pi = median(Pi, na.rm = T))
temp = countries %>%
mutate(Country = ifelse(Country == "Czech Republic", "Czech_Republic", Country))%>%
mutate(Country = ifelse(Country == "New Zealand", "New_Zealand", Country)) %>%
right_join(., temp)
summary_diversity = wheat_per_country %>%
mutate(Area = ifelse(Area == "United States of America", "USA", Area)) %>%
mutate(Area = ifelse(Area == "United Kingdom of Great Britain and Northern Ireland", "UK", Area)) %>%
mutate(Area = ifelse(Area == "Czechia", "Czech_Republic", Area)) %>%
mutate(Area = ifelse(Area == "New Zealand", "New_Zealand", Area)) %>%
mutate(Area = ifelse(Area == "Iran (Islamic Republic of)", "Iran", Area)) %>%
inner_join(., temp, by = c("Area" = "Country"))
p3 = pivot_longer(summary_diversity, cols = c(Arable_land_for_wheat, Land_for_wheat),
names_to = "Wheat_qty_est", values_to = "Percent") %>%
ggplot(aes(x = Pi, y = Percent, label = Area, col = Continent)) +
geom_text() +
theme_bw() +
facet_wrap(vars(Wheat_qty_est), scales = "free_y") +
Color_Continent +
xlim(c(0.004, 0.012))
p1 = filter(summary_diversity, Continent == "Europe") %>%
ggplot(aes(x = Pi, y = Land_for_wheat, label = Area, col = Continent)) +
geom_text() +
theme_bw() +
Color_Continent
p2 = filter(summary_diversity, Continent != "Europe") %>%
ggplot(aes(x = Pi, y = Land_for_wheat, label = Area, col = Continent)) +
geom_text() +
theme_bw() +
Color_Continent
row = cowplot::plot_grid(p1, p2, nrow = 1)
cowplot::plot_grid(p3, row, ncol = 1)
model1 = lm(Pi ~ Land_for_wheat + Continent, data = summary_diversity)
summary(model1)
##
## Call:
## lm(formula = Pi ~ Land_for_wheat + Continent, data = summary_diversity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.869e-03 -2.051e-04 7.367e-05 5.061e-04 1.481e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.155e-02 8.345e-04 13.838 6.03e-10 ***
## Land_for_wheat 1.088e-04 1.017e-04 1.070 0.301376
## ContinentEurope -1.358e-03 9.203e-04 -1.476 0.160715
## ContinentMiddle East 2.215e-05 1.059e-03 0.021 0.983583
## ContinentNorth America -4.427e-03 9.389e-04 -4.715 0.000276 ***
## ContinentOceania -4.549e-03 1.376e-03 -3.305 0.004808 **
## ContinentSouth America -3.776e-03 1.018e-03 -3.711 0.002090 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001101 on 15 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8273, Adjusted R-squared: 0.7582
## F-statistic: 11.97 on 6 and 15 DF, p-value: 5.53e-05
model2 = lm(Pi ~ Land_for_wheat + Continent + Land_for_wheat*Continent, data = summary_diversity)
summary(model2)
##
## Call:
## lm(formula = Pi ~ Land_for_wheat + Continent + Land_for_wheat *
## Continent, data = summary_diversity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0016415 -0.0000839 0.0000077 0.0002099 0.0008036
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.189e-02 7.882e-04 15.079 1.08e-08
## Land_for_wheat -5.682e-06 2.141e-04 -0.027 0.979300
## ContinentEurope -1.177e-03 9.502e-04 -1.238 0.241328
## ContinentMiddle East -9.506e-04 1.101e-03 -0.864 0.406187
## ContinentNorth America 1.068e-03 6.020e-03 0.177 0.862384
## ContinentOceania -4.859e-03 1.000e-03 -4.857 0.000505
## ContinentSouth America -9.245e-03 1.325e-03 -6.975 2.34e-05
## Land_for_wheat:ContinentEurope 2.629e-05 2.288e-04 0.115 0.910595
## Land_for_wheat:ContinentMiddle East 2.166e-04 2.393e-04 0.905 0.384833
## Land_for_wheat:ContinentNorth America -4.866e-03 5.093e-03 -0.955 0.359903
## Land_for_wheat:ContinentOceania NA NA NA NA
## Land_for_wheat:ContinentSouth America 3.771e-03 7.398e-04 5.097 0.000346
##
## (Intercept) ***
## Land_for_wheat
## ContinentEurope
## ContinentMiddle East
## ContinentNorth America
## ContinentOceania ***
## ContinentSouth America ***
## Land_for_wheat:ContinentEurope
## Land_for_wheat:ContinentMiddle East
## Land_for_wheat:ContinentNorth America
## Land_for_wheat:ContinentOceania
## Land_for_wheat:ContinentSouth America ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000666 on 11 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9537, Adjusted R-squared: 0.9115
## F-statistic: 22.64 on 10 and 11 DF, p-value: 6.426e-06
temp = read_tsv(paste0(Sumstats_dir, "Summary_diversity_pi.tsv"),
col_names = c("Subset_size", "Country", "Year", "Lat", "Long", "Repeat", "Mean_pi", "Median_pi")) %>%
unite(col = "Place", Country, Year, Lat, Long, remove = FALSE) %>%
#mutate(Country = ifelse(str_detect(Country, "^USA"), "USA", Country)) %>%
mutate(Country = ifelse(str_detect(Country, "^Australia"), "Australia", Country))%>%
left_join(Zt_meta %>% dplyr::select(Country, Continent) %>% distinct()) %>%
mutate(Continent = ifelse(Country == "Australia", "Oceania", Continent)) %>%
mutate(Continent = ifelse(Country == "NewZealand", "Oceania", Continent)) %>%
mutate(Continent = ifelse(Country == "USA", "North America", Continent))
temp %>%
ggplot(aes(x = Place, y = Median_pi)) +
geom_point(col = "goldenrod", alpha = .6) +
facet_wrap(vars(Subset_size)) +
coord_flip() + theme_bw()
summary_diversity2 = wheat_per_country %>%
#mutate(Area = ifelse(Area == "United States of America", "USA", Area)) %>%
mutate(Area = ifelse(Area == "United Kingdom of Great Britain and Northern Ireland", "UK", Area)) %>%
mutate(Area = ifelse(Area == "Czechia", "Czech_Republic", Area)) %>%
mutate(Area = ifelse(Area == "New Zealand", "NewZealand", Area)) %>%
mutate(Area = ifelse(Area == "Iran (Islamic Republic of)", "Iran", Area)) %>%
inner_join(., filter(temp, Subset_size == 6) %>% group_by(Country, Continent) %>% summarize(Median_pi = median(Median_pi)),
by = c("Area" = "Country"))
pivot_longer(summary_diversity2, cols = c(Arable_land_for_wheat, Land_for_wheat, Wheat_ha),
names_to = "Wheat_qty_est", values_to = "Land_use") %>%
ggplot(aes(x = Median_pi, y = Land_use, label = Area, col = Continent)) +
geom_text() +
theme_bw() +
facet_wrap(vars(Wheat_qty_est), scales = "free_y") +
Color_Continent
filter(summary_diversity2, Continent == "Europe") %>%
ggplot(aes(x = Median_pi, y = Arable_land_for_wheat, label = Area, col = Continent)) +
geom_text() +
theme_bw() +
Color_Continent
#Read and correct capitals for coordinates per country
wheat_trade = read_csv(paste0(metadata_dir, "FAOSTAT_data_4-9-2021_detailed_trade_matrix.csv"))
correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary")
for (i in c(1:nrow(correspondence_table))) {
print(pull(correspondence_table[i, 2]))
print(pull(correspondence_table[i, 1]))
wheat_trade = wheat_trade %>%
mutate_at(vars(contains('Countries')),
funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Bolivia (Plurinational State of)"
## [1] "Bolivia"
## [1] "Brunei Darussalam"
## [1] "Brunei"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "French Guyana"
## [1] "French Guiana"
## [1] "Iran (Islamic Republic of)"
## [1] "Iran"
## [1] "Côte d'Ivoire"
## [1] "Ivory Coast"
## [1] "North Macedonia"
## [1] "Macedonia"
## [1] "Melanesia"
## [1] "Mayotte"
## [1] "Republic of Moldova"
## [1] "Moldova"
## [1] "Democratic People's Republic of Korea"
## [1] "North Korea"
## [1] "Congo"
## [1] "Republic of Congo"
## [1] "Réunion"
## [1] "Reunion"
## [1] "Russian Federation"
## [1] "Russia"
## [1] "Saint Kitts and Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent and the Grenadines"
## [1] "Saint Vincent"
## [1] "Republic of Korea"
## [1] "South Korea"
## [1] "Syrian Arab Republic"
## [1] "Syria"
## [1] "United Republic of Tanzania"
## [1] "Tanzania"
## [1] "Trinidad and Tobago"
## [1] "Trinidad"
## [1] "United Kingdom of Great Britain and Northern Ireland"
## [1] "UK"
## [1] "United States of America"
## [1] "USA"
## [1] "Venezuela (Bolivarian Republic of)"
## [1] "Venezuela"
our_wheat_trade = wheat_trade %>%
filter(Element == "Import Quantity" & Unit == "tonnes") %>%
group_by(`Reporter Countries`, `Partner Countries`) %>%
dplyr::select(`Reporter Countries`, `Partner Countries`, Value) %>%
summarize(Average_value = mean(Value))
#Read and correct capitals for coordinates per country
capital = read_csv(paste0(metadata_dir, "simple_maps_worldcities.csv")) %>%
filter(capital == "primary") %>%
group_by(country) %>%
mutate(rank = rank(population, ties.method = "random")) %>%
filter(rank == 1)
correspondence_table = readxl::read_excel(path = paste0(metadata_dir, "FAO_stats_map_corres.xlsx"), sheet = "Summary_capital")
for (i in c(1:nrow(correspondence_table))) {
print(pull(correspondence_table[i, 2]))
print(pull(correspondence_table[i, 1]))
capital = capital %>%
mutate_at(vars(contains('country')),
funs(str_replace(., pull(correspondence_table[i, 2]), pull(correspondence_table[i, 1]))))
}
## [1] "Antigua And Barbuda"
## [1] "Antigua"
## [1] "Bahamas, The"
## [1] "Bahamas"
## [1] "Bosnia And Herzegovina"
## [1] "Bosnia and Herzegovina"
## [1] "Curaçao"
## [1] "Curacao"
## [1] "Czechia"
## [1] "Czech Republic"
## [1] "Congo (Kinshasa)"
## [1] "Democratic Republic of the Congo"
## [1] "Falkland Islands (Islas Malvinas)"
## [1] "Falkland Islands"
## [1] "Gambia, The"
## [1] "Gambia"
## [1] "Isle Of Man"
## [1] "Isle of Man"
## [1] "Côte D’Ivoire"
## [1] "Ivory Coast"
## [1] "Micronesia, Federated States Of"
## [1] "Micronesia"
## [1] "Congo (Brazzaville)"
## [1] "Republic of Congo"
## [1] "Saint Helena, Ascension, And Tristan Da Cunha"
## [1] "Saint Helena"
## [1] "Saint Kitts And Nevis"
## [1] "Saint Kitts"
## [1] "Saint Vincent And The Grenadines"
## [1] "Saint Vincent"
## [1] "Sao Tome And Principe"
## [1] "Sao Tome and Principe"
## [1] "South Georgia And South Sandwich Islands"
## [1] "South Georgia"
## [1] "Korea, South"
## [1] "South Korea"
## [1] "Trinidad And Tobago"
## [1] "Tobago"
## [1] "Turks And Caicos Islands"
## [1] "Turks and Caicos Islands"
## [1] "United Kingdom"
## [1] "UK"
## [1] "United States"
## [1] "USA"
## [1] "Vatican City"
## [1] "Vatican"
## [1] "Wallis And Futuna"
## [1] "Wallis and Futuna"
temp = inner_join(our_wheat_trade, capital %>%
dplyr::select(country, yend = lat, xend = lng),
by = c("Reporter Countries" = "country")) %>%
inner_join(., capital %>%
dplyr::select(country, y = lat, x = lng),
by = c("Partner Countries" = "country"))
temp2 = filter(temp,`Partner Countries` != `Reporter Countries`) %>%
filter(Average_value > 10000) %>% distinct()
#
ggplot() + theme_void() +
geom_polygon(data = world2,
aes(x=long, y = lat, group = group, fill = Land_for_wheat),
alpha=0.7) +
scale_fill_gradient(low = "#FEEDD8", high = "#faa307", na.value="#F7F4F3") +
geom_curve(data = temp2, aes(x = x, y = y, xend = xend, yend = yend,
size = Average_value, alpha = Average_value),
curvature = 0.2, color = "#432818") +
scale_size_continuous(guide = FALSE, range = c(0.02, 0.5))